Housekeeping

##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 461460 24.7    1004631 53.7   627651 33.6
## Vcells 941728  7.2    8388608 64.0  1752011 13.4
#' Extend each superkingdom's top hits with the other superkingdom's top models
#'
#' For every PFAM model that is listed in one top-hit table but not in the
#' other, the matching rows of `d_superkingdom` (restricted to the superkingdom
#' of the table being extended) are appended. Models without a matching row in
#' `d_superkingdom` add nothing.
#'
#' @param top_bacteria Data frame of top hits with columns `model` and
#'   `Superkingdom`.
#' @param top_archaea Data frame of top hits with columns `model` and
#'   `Superkingdom`.
#' @param d_superkingdom Data frame of counts with (at least) the columns
#'   `model` and `Superkingdom`; source of the appended rows.
#' @return Unnamed list of length two: the extended `top_bacteria` followed by
#'   the extended `top_archaea`.
extend_top_hits <- function(top_bacteria, top_archaea, d_superkingdom) {
  # Append to `own_top` the rows of `d_superkingdom` for each model that occurs
  # in `other_top` but not in `own_top`, in the order of `other_top`.
  complement_hits <- function(own_top, other_top) {
    # %in% (instead of ==) avoids silent recycling should `own_top` ever
    # contain more than one superkingdom.
    own_superkingdom <- unique(own_top$Superkingdom)

    # which() yields integer(0) for an empty table, whereas 1:nrow() would
    # iterate over c(1, 0) and fail.
    missing_idx <- which(!(other_top$model %in% own_top$model))

    extra_rows <- lapply(missing_idx, function(i) {
      keep <- d_superkingdom$model == other_top$model[i] &
        d_superkingdom$Superkingdom %in% own_superkingdom
      d_superkingdom[which(keep), ]
    })

    if (length(extra_rows) == 0L) {
      return(own_top)
    }
    # Bind once instead of growing the data frame inside a loop.
    do.call(rbind, c(list(own_top), extra_rows))
  }

  list(
    complement_hits(top_bacteria, top_archaea),
    complement_hits(top_archaea, top_bacteria)
  )
}

#' Build an interactive plotly version of the PFAM rank plot
#'
#' Draws `rank` against `Superkingdom` with one dodged point/line series per
#' `PFAM_desc` using ggplot2 and converts the result with `plotly::ggplotly()`.
#' The widget is returned, not written to disk; use
#' `htmlwidgets::saveWidget()` on the result to save it.
#'
#' Uses objects defined outside this function: the ggplot2 functions (assumed
#' to be attached by the calling script), the colour vector `morecolors2` and
#' the helper `beautifier()`.
#'
#' @param df_spec Data frame with the columns `Superkingdom`, `rank`,
#'   `PFAM_desc`, `PFAM_Name` and `model`.
#' @return The plotly object produced by `plotly::ggplotly()` after the x-axis
#'   layout has been adjusted.
make_it_plotly <- function(df_spec) {
  # Fail loudly if plotly is missing; require() would only return FALSE.
  # plotly is called via `plotly::` and never attached, so nothing is masked
  # and no detach() is needed (the old detach was skipped on errors).
  if (!requireNamespace("plotly", quietly = TRUE)) {
    stop("Package 'plotly' is required for make_it_plotly().", call. = FALSE)
  }

  # Text which is displayed in the text-box on hover over a data point
  df_spec$textlabel <- paste("PFAM-Name: ", df_spec$PFAM_Name, "\n",
                             "PFAM-Desc.: ", df_spec$PFAM_desc, "\n",
                             "PFAM-Acce.: ", df_spec$model)
  # # Pure plotly, no dodging
  # p <- plot_ly(data = df_spec, x = ~Superkingdom, y = ~rank, color = ~PFAM_desc,
  #              type = "scatter", mode = "marker",
  #              colors = morecolors,
  #              text = ~textlabel, hoverinfo = 'text') %>%
  #   add_trace(y =~PFAM_desc, mode = "lines") %>%
  #    layout(xaxis = list(side ="top"))
  # htmlwidgets::saveWidget(widget = p, file = paste0(pathImages, "PFAM_overall", ".html"))

  # Using ggplotly; points and lines are dodged horizontally so that
  # families sharing a rank do not overlap
  pd <- position_dodge(0.4)
  p <- ggplot(df_spec, aes(x = Superkingdom, y = rank, color = PFAM_desc, text = textlabel)) +
    geom_point(position = pd) +
    geom_line(aes(group = PFAM_desc),
              position = pd) +
    scale_x_discrete(position = "top") +
    scale_color_manual(values = morecolors2) +
    labs(x = "Superkingdom",
         y = "Rank of count",
         color = "PFAM Family") +
    theme(legend.position = "right",
          plot.margin = margin(l = 0, r = 0)) +
    guides(color = guide_legend(nrow = 20, byrow = TRUE))
  p <- beautifier(p)

  # Show only the custom hover text, then move the x-axis to the top
  p <- plotly::ggplotly(p, tooltip = "text")
  plotly::layout(p,
                 xaxis = list(side = "top",
                              tickangle = 45))
}

Data Loading

TR clustering of PFAM domains

-> see file pfam_clans_analysis.R

First some Background information on how the data was generated.

Annotation of Tandem Repeats regions

(from MSc thesis of Paulina) First, tandem repeat motifs are obtained from databases and/or detected with de novo algorithms. As a set of potential TR unit seeds, one can use known tandem domains (Schaper et al., 2014). For example, the Pfam database provides many sequence profile hidden Markov models (HMMs) of common protein domains (Punta et al., 2012). For repetitive DNA sequences, Dfam can be used (Wheeler et al., 2013). Annotation using a known motif allows one, for example, to study the evolution of a TR between sequence homologues or to check whether a known motif of a TR occurs in a sequence (Schaper et al., 2015). From the Pfam or Dfam domains, circular profile HMMs (cpHMMs) can be constructed, which, unlike standard HMMs, allow transitions between the last and the first match state. These circular connections in the HMM allow one TR unit to be appended directly to the next and any number of TR units to follow in succession. TRs corresponding to the entries in Pfam/Dfam can then be annotated using the Viterbi algorithm applied to the circular profile HMM. The resulting Viterbi path is the path through the cpHMM which best describes the sequence and is interpreted as an emission instance of the model. The Viterbi path divides the sequence into the flanking sequence and the TR sequence. Then all TR units are reconstructed, where TR unit breaks are inserted between the kth and (k+1)th consensus positions. k is chosen to minimize the distance between the break and the first consensus state and also the distance between the break and the last consensus state (Schaper et al., 2014).
Since short or rare TRs are rarely available in the databases, TRs can also be predicted by de novo TR detectors (TRDs). For this purpose, TRAL integrates various external software. Due to the significant differences in the algorithms, and the often incoherent results of the individual TRDs, it makes sense to use the results of different TRDs together. TRAL therefore represents a meta-TRD to construct a set of TRs as complete as possible (Schaper et al., 2012). The following TRDs are currently integrated in TRAL: HHrepID (Biegert and Söding, 2008), Phobos (Mayer, 2006), TRED (Sokol et al., 2007), T-REKS (Jorda and Kajava, 2009), TRF (Benson, 1999), TRUST (Szklarczyk and Heringa, 2004), and XSTREAM (Newman and Cooper, 2007). An automatic sanity check discards detected TRs which were not part of the input sequence. From the de novo TRs, profile HMMs can be created (e.g. with HMMER (Eddy, 1995)), which can then be further refined with circular profile HMMs. This is because a TRD may have found a TR correctly, but did not capture all its units or set the TR boundaries incorrectly (Schaper et al., 2015).

PFAM-family added

The protein sequences from swissrepeat were analysed for TR regions using HMMs from PFAM. If they were not available for certain proteins, other TR detectors were used.
ASK PAULINA IF POSSIBLE. Or were all proteins analysed both de novo and with PFAM, and then, after statistical significance testing, the TR with the highest probability was selected as the true one?

# Distinct values of the TRD column of tr_all
unique(tr_all$TRD)
## [1] HHrepID XSTREAM PFAM    T-REKS 
## Levels: HHrepID PFAM T-REKS XSTREAM

Namely, HHrepID, XSTREAM or T-REKS.

# Number of rows in tr_all_sp_sub relative to the number of rows in tr_all_sp
nrow(tr_all_sp_sub)/nrow(tr_all_sp)
## [1] 0.04630184

Of all detected TRs in Swiss-Prot, 4.6% were retrieved with a PFAM model. (TODO: is “retrieved” the correct term here?)
Therefore, only a small portion of all available proteins could be clustered into their protein families and clans.

For the following analysis, we considered only TRs in proteins which were associated with a PFAM model (PFAM accession number), and only those for which we could add the PFAM name and description.

Distribution of PFAM domain annotations in the superkingdoms, kingdoms, and across mitochondrial and chloroplastic genes (~ Fig4. Marcotte et al.).

The TR were then summarized by their Superkingdom as well as for their Kingdom, their PFAM-Name, PFAM-description and if they have endosymbiotic origin.

# Per-superkingdom counts: one row per combination of superkingdom, PFAM
# family (name, description, accession) and the two organelle flags;
# `count` is the number of IDs in that group.
d_superkingdom <- ddply(
  tr_all_sp_sub,
  .(Superkingdom, PFAM_Name, PFAM_desc, model, is_chloroplastic, is_mitochondrial),
  summarize,
  count = length(ID)
)
# Variant without the organelle flags:
# d_superkingdom <- ddply(
#   tr_all_sp_sub,
#   .(Superkingdom, PFAM_Name, PFAM_desc, model),
#   summarize,
#   count = length(ID)
# )

# Per-kingdom counts: same grouping as above, additionally split by Kingdom.
d_kingdom <- ddply(
  tr_all_sp_sub,
  .(Superkingdom, Kingdom, PFAM_Name, PFAM_desc, model, is_chloroplastic, is_mitochondrial),
  summarize,
  count = length(ID)
)

# Number of top hits to keep per superkingdom
TOP <- 10

# d_tabled <- as.data.frame(table(d$PFAM_desc))
# d_tabled[order(d_tabled$Freq, decreasing = TRUE)[1:TOP],]

We clearly see, that the most TRs associated with a PFAM-model were found in eukaryotes followed by bacteria, archaea and viruses.
Since many model organisms are from bacterial or eukaryotic origin, we assume that there might be a bias because of better curation and more entries in the databases for such organisms.
For this reason, we counted the TRs (associated with a PFAM model) grouped by the above-mentioned factors and sorted them in decreasing order. From those, the top 10 were selected and ranked. This allows the comparison of the number of TRs across the superkingdoms on the same scale.

Top-10 Bacterial PFAM-Families

# Bacterial rows of the per-superkingdom counts, largest count first, limited
# to at most TOP rows. head() is used instead of `[1:TOP]` so that no all-NA
# padding rows appear when fewer than TOP rows are available.
top_bacteria <- subset(d_superkingdom, Superkingdom == "Bacteria")
(top_bacteria <- top_bacteria[head(order(top_bacteria$count, decreasing = TRUE), TOP), ])
##     Superkingdom       PFAM_Name
## 70      Bacteria         Hexapep
## 448     Bacteria            MraZ
## 107     Bacteria    Ribosomal_L6
## 814     Bacteria    NTP_transf_3
## 880     Bacteria       Hexapep_2
## 657     Bacteria            PD40
## 864     Bacteria Acetyltransf_11
## 559     Bacteria            LpxD
## 548     Bacteria          TolB_N
## 541     Bacteria   DNA_gyraseA_C
##                                                            PFAM_desc
## 70                   Bacterial transferase hexapeptide (six repeats)
## 448                            MraZ protein, putative antitoxin-like
## 107                                             Ribosomal protein L6
## 814                                 MobA-like NTP transferase domain
## 880                       Hexapeptide repeat of succinyl-transferase
## 657                                  WD40-like Beta Propeller Repeat
## 864              Udp N-acetylglucosamine O-acyltransferase; Domain 2
## 559 UDP-3-O-[3-hydroxymyristoyl] glucosamine N-acyltransferase, LpxD
## 548                                       TolB amino-terminal domain
## 541                     DNA gyrase C-terminal domain, beta-propeller
##       model is_chloroplastic is_mitochondrial count
## 70  PF00132            FALSE            FALSE   928
## 448 PF02381            FALSE            FALSE   320
## 107 PF00347            FALSE            FALSE   317
## 814 PF12804            FALSE            FALSE   244
## 880 PF14602            FALSE            FALSE   223
## 657 PF07676            FALSE            FALSE   164
## 864 PF13720            FALSE            FALSE   158
## 559 PF04613            FALSE            FALSE   127
## 548 PF04052            FALSE            FALSE   115
## 541 PF03989            FALSE            FALSE   100
  1. Bacterial transferase hexapeptide (six repeats) (similar to Hexapeptide repeat of succinyl-transferase)
    A variety of bacterial transferases contain a repeat structure composed of tandem repeats of a [LIV]-G-X(4) hexapeptide, which, in the tertiary structure of LpxA (UDP N-acetylglucosamine acyltransferase) [PUBMED:7481807], has been shown to form a left-handed parallel beta helix. A number of different transferase protein families contain this repeat, such as
    galactoside acetyltransferase-like proteins [PUBMED:11937062]: The galactoside acetyltransferase (thiogalactoside transacetylase) of Escherichia coli (GAT, LacA, EC 2.3.1.18) is a gene product of the classical lac operon. GAT may assist cellular detoxification by acetylating nonmetabolizable pyranosides, thereby preventing their reentry into the cell.
    the gamma-class of carbonic anhydrases [PUBMED:10924115]: Carbonic anhydrases (CA: EC:4.2.1.1) are zinc metalloenzymes which catalyse the reversible hydration of carbon dioxide to bicarbonate [PMID: 18336305, PMID: 10978542].
    and tetrahydrodipicolinate-N-succinyltransferases (DapD) the latter containing an extra N-terminal 3-helical domain [PUBMED:11910040]: Tetrahydrodipicolinate N-succinyltransferase (DapD) catalyzes the succinyl-CoA-dependent acylation of L-2-amino-6-oxopimelate to 2-N-succinyl-6-oxopimelate as part of the succinylase branch of the meso-diaminopimelate/lysine biosynthetic pathway of bacteria, blue-green algae, and plants. This pathway provides meso-diaminopimelate as a building block for cell wall peptidoglycan in most bacteria, and is regarded as a target pathway for antibacterial agents.
  2. MraZ protein, putative antitoxin-like
    This small 70 amino acid domain is found duplicated in a family of bacterial proteins. These proteins may be DNA-binding transcription factors (Pers. comm. A Andreeva & A Murzin). It is likely, due to the similarity of fold, that this family acts as a bacterial antitoxin like the MazE antitoxin family.
  3. Ribosomal protein L6
    Ribosomes are the particles that catalyse mRNA-directed protein synthesis in all organisms. The codons of the mRNA are exposed on the ribosome to allow tRNA binding. This leads to the incorporation of amino acids into the growing polypeptide chain in accordance with the genetic information. Incoming amino acid monomers enter the ribosomal A site in the form of aminoacyl-tRNAs complexed with elongation factor Tu (EF-Tu) and GTP. The growing polypeptide chain, situated in the P site as peptidyl-tRNA, is then transferred to aminoacyl-tRNA and the new peptidyl-tRNA, extended by one residue, is translocated to the P site with the aid of the elongation factor G (EF-G) and GTP as the deacylated tRNA is released from the ribosome through one or more exit sites [PUBMED:11297922, PUBMED:11290319]. About 2/3 of the mass of the ribosome consists of RNA and 1/3 of protein. The proteins are named in accordance with the subunit of the ribosome which they belong to - the small (S1 to S31) and the large (L1 to L44). Usually they decorate the rRNA cores of the subunits. Many ribosomal proteins, particularly those of the large subunit, are composed of a globular, surface-exposed domain with long finger-like projections that extend into the rRNA core to stabilise its structure. Most of the proteins interact with multiple RNA elements, often from different domains. In the large subunit, about 1/3 of the 23S rRNA nucleotides are at least in van der Waals contact with protein, and L22 interacts with all six domains of the 23S rRNA. Proteins S4 and S7, which initiate assembly of the 16S rRNA, are located at junctions of five and four RNA helices, respectively. In this way proteins serve to organise and stabilise the rRNA tertiary structure. While the crucial activities of decoding and peptide transfer are RNA based, proteins play an active role in functions that may have evolved to streamline the process of protein synthesis.
In addition to their function in the ribosome, many ribosomal proteins have some function ‘outside’ the ribosome [PUBMED:11290319, PUBMED:11114498].
    L6 is a protein from the large (50S) subunit. In Escherichia coli, it is located in the aminoacyl-tRNA binding site of the peptidyltransferase centre, and is known to bind directly to 23S rRNA. It belongs to a family of ribosomal proteins, including L6 from bacteria, cyanelles (structures that perform similar functions to chloroplasts, but have structural and biochemical characteristics of Cyanobacteria) and mitochondria; and L9 from mammals, Drosophila, plants and yeast. L6 contains two domains with almost identical folds, suggesting that it was derived by the duplication of an ancient RNA-binding protein gene. Analysis reveals several sites on the protein surface where interactions with other ribosome components may occur, the N terminus being involved in protein-protein interactions and the C terminus containing possible RNA-binding sites [PUBMED:8262035]. This entry represents the alpha-beta domain found duplicated in ribosomal L6 proteins. This domain consists of two beta-sheets and one alpha-helix packed around a single core [PUBMED:8262035].
  4. MobA-like NTP transferase domain
    This family includes the MobA protein (Molybdopterin-guanine dinucleotide biosynthesis protein A). The family also includes a wide range of other NTP transferase domain: The biosynthesis of disaccharides, oligosaccharides and polysaccharides involves the action of hundreds of different glycosyltransferases. These enzymes catalyse the transfer of sugar moieties from activated donor molecules to specific acceptor molecules, forming glycosidic bonds. A classification of glycosyltransferases using nucleotide diphospho-sugar, nucleotide monophospho-sugar and sugar phosphates (EC:2.4.1.-) and related proteins into distinct sequence based families has been described [PMID: 9334165].
  5. Hexapeptide repeat of succinyl-transferase (similar to Bacterial transferase hexapeptide (six repeats))
    A variety of bacterial transferases contain a repeat structure composed of tandem repeats of a [LIV]-G-X(4) hexapeptide, which, in the tertiary structure of LpxA (UDP N-acetylglucosamine acyltransferase) [PUBMED:7481807], has been shown to form a left-handed parallel beta helix. LpxA is the first enzyme in the lipid A biosynthetic pathway and is a target for the design of antibiotics. A number of different transferase protein families contain this repeat, such as:
    galactoside acetyltransferase-like proteins [PUBMED:11937062]: The galactoside acetyltransferase (thiogalactoside transacetylase) of Escherichia coli (GAT, LacA, EC 2.3.1.18) is a gene product of the classical lac operon. GAT may assist cellular detoxification by acetylating nonmetabolizable pyranosides, thereby preventing their reentry into the cell.
    gamma-class of carbonic anhydrases [PUBMED:10924115]: Carbonic anhydrases (CA: EC:4.2.1.1) are zinc metalloenzymes which catalyse the reversible hydration of carbon dioxide to bicarbonate [PMID: 18336305, PMID: 10978542].
    tetrahydrodipicolinate-N-succinyltransferases (DapD), the latter containing an extra N-terminal 3-helical domain [PUBMED:11910040]: Tetrahydrodipicolinate N-succinyltransferase (DapD) catalyzes the succinyl-CoA-dependent acylation of L-2-amino-6-oxopimelate to 2-N-succinyl-6-oxopimelate as part of the succinylase branch of the meso-diaminopimelate/lysine biosynthetic pathway of bacteria, blue-green algae, and plants. This pathway provides meso-diaminopimelate as a building block for cell wall peptidoglycan in most bacteria, and is regarded as a target pathway for antibacterial agents.
  6. WD40-like Beta Propeller Repeat
    WD-40 repeats (also known as WD or beta-transducin repeats) are short ~40 amino acid motifs, often terminating in a Trp-Asp (W-D) dipeptide. WD40 repeats usually assume a 7-8 bladed beta-propeller fold, but proteins have been found with 4 to 16 repeated units, which also form a circularised beta-propeller structure. WD-repeat proteins are a large family found in all eukaryotes and are implicated in a variety of functions ranging from signal transduction and transcription regulation to cell cycle control and apoptosis. Repeated WD40 motifs act as a site for protein-protein interaction, and proteins containing WD40 repeats are known to serve as platforms for the assembly of protein complexes or mediators of transient interplay among other proteins. The specificity of the proteins is determined by the sequences outside the repeats themselves. Examples of such complexes are G proteins (beta subunit is a beta-propeller), TAFII transcription factor, and E3 ubiquitin ligase [PUBMED:11814058, PUBMED:10322433]. In Arabidopsis spp., several WD40-containing proteins act as key regulators of plant-specific developmental events. This region appears to be related to the INTERPRO repeat. This model is likely to miss copies within a sequence.
    One group of WD40 repeats is involved in RNA-processing complexes, some members forming part of snRNP particles [6]. Another group makes up parts of transcriptional regulators, including the TFIID subunit of the TATA-box-binding complex [7, 8, 9]. Others play roles in cytoskeleton assembly and mitotic-spindle formation [10, 11], regulate vesicle formation and vesicular trafficking [12], control various aspects of cell division [13] or regulate sulfur metabolism in fungi [14].
  7. Udp N-acetylglucosamine O-acyltransferase; Domain 2
    This is the C-terminal domain of UDP N-acetylglucosamine O-acyltransferase. This enzyme is a zinc-dependent enzyme that catalyses the deacetylation of UDP-3-O-((R)-3-hydroxymyristoyl)-N-acetylglucosamine to form UDP-3-O-(R-hydroxymyristoyl)glucosamine and acetate [PUBMED:15705580].
    UDP is an important factor in glycogenesis. Before glucose can be stored as glycogen in the liver and muscles, the enzyme UDP-glucose pyrophosphorylase forms a UDP-glucose unit by combining glucose 1-phosphate with uridine triphosphate, cleaving a pyrophosphate ion in the process. Then, the enzyme glycogen synthase combines UDP-glucose units to form a glycogen chain. The UDP molecule is cleaved from the glucose ring during this process and can be reused by UDP-glucose pyrophosphorylase.[1][2]
  8. UDP-3-O-[3-hydroxymyristoyl] glucosamine N-acyltransferase, LpxD
    UDP-3-O-[3-hydroxymyristoyl] glucosamine N-acyltransferase (EC 2.3.1.-) catalyses an early step in lipid A biosynthesis: UDP-3-O-(3-hydroxytetradecanoyl)glucosamine + (R)-3-hydroxytetradecanoyl- [acyl carrier protein] -> UDP-2,3-bis(3-hydroxytetradecanoyl)glucosamine + [acyl carrier protein] [1]. Members of this family also contain a hexapeptide repeat (PF00132). This family constitutes the non-repeating region of LPXD proteins.
  9. TolB amino-terminal domain
    TolB is a periplasmic protein from Escherichia coli that is part of the Tol-dependent translocation system involving group A and E colicins that is used to penetrate and kill cells [PUBMED:10545334, PUBMED:10673426]. TolB is required for lethal infection by Salmonella typhimurium in mice. Colicins are released into the environment to reduce competition from other bacterial strains. Colicins bind to outer membrane receptors, using them to translocate to the cytoplasm or cytoplasmic membrane, where they exert their cytotoxic effect, including depolarisation of the cytoplasmic membrane, DNase activity, RNase activity, or inhibition of murein synthesis.
    TolB has two domains, an alpha-helical N-terminal domain that shares structural similarity with the C-terminal domain of transfer RNA ligases, and a beta-propeller C-terminal domain (INTERPRO) that shares structural similarity with numerous members of the prolyl oligopeptidase family and, to a lesser extent, to class B metallo-beta-lactamases [PUBMED:10545334]. The function of the N-terminal domain is uncertain.
    The Tol-Pal complex of Escherichia coli is composed of five proteins that interact in the cell envelope. The TolA, TolQ, and TolR cytoplasmic membrane proteins interact with each other through their transmembrane segments (23, 35). The outer membrane-anchored Pal lipoprotein interacts with the periplasmic TolB protein (8). A link between inner and outer membranes is mediated by the interaction of the TolA C-terminal domain with Pal and TolB.
  10. DNA gyrase C-terminal domain, beta-propeller
    This repeat is found as 6 tandem copies at the C-termini of GyrA and ParC DNA gyrases. It is predicted to form 4 beta strands and to probably form a beta-propeller structure [PUBMED:11948780]. This region has been shown to bind DNA non-specifically and may stabilise the DNA-topoisomerase complex [PUBMED:1657531].
    DNA gyrase, or simply gyrase, is an enzyme within the class of topoisomerase and is a subclass of Type II topoisomerases[1] that reduces topological strain in an ATP dependent manner while double-stranded DNA is being unwound by elongating RNA-polymerase [2] or by helicase in front of the progressing replication fork.

SUMMARY:
The bacterial TR regions with annotated protein families seem to belong to proteins that are most likely unique to, or very distinctive of, bacteria.

Top-10 Archaeal PFAM-Families

# Archaeal rows of the per-superkingdom counts, largest count first, limited
# to at most TOP rows. head() is used instead of `[1:TOP]` so that no all-NA
# padding rows appear when fewer than TOP rows are available.
top_archaea <- subset(d_superkingdom, Superkingdom == "Archaea")
(top_archaea <- top_archaea[head(order(top_archaea$count, decreasing = TRUE), TOP), ])
##     Superkingdom     PFAM_Name
## 117      Archaea         TFIIB
## 182      Archaea           CBS
## 26       Archaea          Fer4
## 816      Archaea        Fer4_7
## 878      Archaea   LAGLIDADG_3
## 72       Archaea       Hexapep
## 700      Archaea  TF_Zn_Ribbon
## 106      Archaea  Ribosomal_L6
## 553      Archaea Rad50_zn_hook
## 846      Archaea       Fer4_10
##                                           PFAM_desc   model
## 117               Transcription factor TFIIB repeat PF00382
## 182                                      CBS domain PF00571
## 26                            4Fe-4S binding domain PF00037
## 816                         4Fe-4S dicluster domain PF12838
## 878                           LAGLIDADG-like domain PF14528
## 72  Bacterial transferase hexapeptide (six repeats) PF00132
## 700                              TFIIB zinc-binding PF08271
## 106                            Ribosomal protein L6 PF00347
## 553                           Rad50 zinc hook motif PF04423
## 846                         4Fe-4S dicluster domain PF13237
##     is_chloroplastic is_mitochondrial count
## 117            FALSE            FALSE    35
## 182            FALSE            FALSE    22
## 26             FALSE            FALSE    16
## 816            FALSE            FALSE    13
## 878            FALSE            FALSE    11
## 72             FALSE            FALSE     9
## 700            FALSE            FALSE     9
## 106            FALSE            FALSE     7
## 553            FALSE            FALSE     7
## 846            FALSE            FALSE     7
# Stack both top-hit tables (archaeal rows first); the outer parentheses
# print the result
(top_prokaryota <- rbind(top_archaea, top_bacteria))
##     Superkingdom       PFAM_Name
## 117      Archaea           TFIIB
## 182      Archaea             CBS
## 26       Archaea            Fer4
## 816      Archaea          Fer4_7
## 878      Archaea     LAGLIDADG_3
## 72       Archaea         Hexapep
## 700      Archaea    TF_Zn_Ribbon
## 106      Archaea    Ribosomal_L6
## 553      Archaea   Rad50_zn_hook
## 846      Archaea         Fer4_10
## 70      Bacteria         Hexapep
## 448     Bacteria            MraZ
## 107     Bacteria    Ribosomal_L6
## 814     Bacteria    NTP_transf_3
## 880     Bacteria       Hexapep_2
## 657     Bacteria            PD40
## 864     Bacteria Acetyltransf_11
## 559     Bacteria            LpxD
## 548     Bacteria          TolB_N
## 541     Bacteria   DNA_gyraseA_C
##                                                            PFAM_desc
## 117                                Transcription factor TFIIB repeat
## 182                                                       CBS domain
## 26                                             4Fe-4S binding domain
## 816                                          4Fe-4S dicluster domain
## 878                                            LAGLIDADG-like domain
## 72                   Bacterial transferase hexapeptide (six repeats)
## 700                                               TFIIB zinc-binding
## 106                                             Ribosomal protein L6
## 553                                            Rad50 zinc hook motif
## 846                                          4Fe-4S dicluster domain
## 70                   Bacterial transferase hexapeptide (six repeats)
## 448                            MraZ protein, putative antitoxin-like
## 107                                             Ribosomal protein L6
## 814                                 MobA-like NTP transferase domain
## 880                       Hexapeptide repeat of succinyl-transferase
## 657                                  WD40-like Beta Propeller Repeat
## 864              Udp N-acetylglucosamine O-acyltransferase; Domain 2
## 559 UDP-3-O-[3-hydroxymyristoyl] glucosamine N-acyltransferase, LpxD
## 548                                       TolB amino-terminal domain
## 541                     DNA gyrase C-terminal domain, beta-propeller
##       model is_chloroplastic is_mitochondrial count
## 117 PF00382            FALSE            FALSE    35
## 182 PF00571            FALSE            FALSE    22
## 26  PF00037            FALSE            FALSE    16
## 816 PF12838            FALSE            FALSE    13
## 878 PF14528            FALSE            FALSE    11
## 72  PF00132            FALSE            FALSE     9
## 700 PF08271            FALSE            FALSE     9
## 106 PF00347            FALSE            FALSE     7
## 553 PF04423            FALSE            FALSE     7
## 846 PF13237            FALSE            FALSE     7
## 70  PF00132            FALSE            FALSE   928
## 448 PF02381            FALSE            FALSE   320
## 107 PF00347            FALSE            FALSE   317
## 814 PF12804            FALSE            FALSE   244
## 880 PF14602            FALSE            FALSE   223
## 657 PF07676            FALSE            FALSE   164
## 864 PF13720            FALSE            FALSE   158
## 559 PF04613            FALSE            FALSE   127
## 548 PF04052            FALSE            FALSE   115
## 541 PF03989            FALSE            FALSE   100
  1. Transcription factor TFIIB repeat
    In eukaryotes, transcription initiation of all protein encoding genes involves the polymerase II system. This system is modulated by both general and specific transcription factors. The general factors (which include TFIIA, TFIIB, TFIID, TFIIE, TFIIF, TFIIG and TFIIH) operate through common promoter elements, such as the TATA box. Transcription factor IIB (TFIIB) is of central importance in transcription of class II genes. It associates with TFIID-TFIIA bound to DNA (the DA complex) to form a ternary TFIID-IIA-IIB (DAB) complex, which is recognised by RNA polymerase II [PUBMED:1876184, PUBMED:1949150]. TFIIB comprises ~315-340 residues and contains an imperfect C-terminal repeat of a 75-residue domain that may contribute to the symmetry of the folded protein. The basal archaeal transcription machinery resembles that of the eukaryotic polymerase II system and includes a homologue of TFIIB [PUBMED:7597027].
    This entry represents a cyclin-like domain which is found repeated in the C-terminal region of a variety of eukaryotic TFIIB’s and their archaeal counterparts. These domains individually form the typical cyclin fold, and in the transcription complex they straddle the C-terminal region of the TATA-binding protein - an interaction essential for the formation of the transcription initiation complex [PUBMED:9177165, PUBMED:10619841].
  2. CBS domain
    In molecular biology, the CBS domain is a protein domain found in a range of proteins in all species from bacteria to humans. It was first identified as a conserved sequence region in 1997 and named after cystathionine beta synthase, one of the proteins it is found in.[2] CBS domains are also found in a wide variety of other proteins such as inosine monophosphate dehydrogenase,[3] voltage gated chloride channels[4][5][6][7][8] and AMP-activated protein kinase (AMPK).[9][10] CBS domains regulate the activity of associated enzymatic and transporter domains in response to binding molecules with adenosyl groups such as AMP and ATP, or s-adenosylmethionine.[11]
    Mutations in some human CBS domain-containing proteins lead to genetic diseases.[3] For example, mutations in the cystathionine-beta-synthase protein lead to an inherited disorder of the metabolism called homocystinuria (OMIM: 236200).[29] Mutations in the gamma subunit of the AMPK enzyme have been shown to lead to familial hypertrophic cardiomyopathy with Wolff-Parkinson-White syndrome (OMIM: 600858). Mutations in the CBS domains of the IMPDH enzyme lead to the eye condition retinitis pigmentosa (OMIM: 180105). Humans have a number of voltage-gated chloride channel genes, and mutations in the CBS domains of several of these have been identified as the cause of genetic diseases. Mutations in CLCN1 lead to myotonia (OMIM: 160800),[30] mutations in CLCN2 can lead to idiopathic generalised epilepsy (OMIM: 600699), mutations in CLCN5 can lead to Dent’s disease (OMIM: 300009), mutations in CLCN7 can lead to osteopetrosis (OMIM: 259700),[31] and mutations in CLCNKB can lead to Bartter syndrome (OMIM: 241200).
  3. 4Fe-4S binding domain
    Ferredoxins are a group of iron-sulphur proteins which mediate electron transfer in a wide variety of metabolic reactions. Ferredoxins can be divided into several subgroups depending upon the physiological nature of the iron-sulphur cluster(s). One of these subgroups are the 4Fe-4S ferredoxins, which are found in bacteria and which are thus often referred as ‘bacterial-type’ ferredoxins. The structure of these proteins [PUBMED:3129571] consists of the duplication of a domain of twenty six amino acid residues; each of these domains contains four cysteine residues that bind to a 4Fe-4S centre. Several structures of the 4Fe-4S ferredoxin domain have been determined [PUBMED:7966291]. The clusters consist of two interleaved 4Fe- and 4S-tetrahedra forming a cubane-like structure, in such a way that the four iron occupy the eight corners of a distorted cube. Each 4Fe-4S is attached to the polypeptide chain by four covalent Fe-S bonds involving cysteine residues. A number of proteins have been found [PUBMED:2185975] that include one or more 4Fe-4S binding domains similar to those of bacterial-type ferredoxins. The pattern of cysteine residues in the iron-sulphur region is sufficient to detect this class of 4Fe-4S binding proteins. This entry represents the whole domain. Note:In some bacterial ferredoxins, one of the two duplicated domains has lost one or more of the four conserved cysteines. The consequence of such variations is that these domains have either lost their iron-sulphur binding property or bind to a 3Fe-3S centre instead of a 4Fe-4S centre.
  4. 4Fe-4S dicluster domain
    Ferredoxins are a group of iron-sulphur proteins which mediate electron transfer in a wide variety of metabolic reactions. Ferredoxins can be divided into several subgroups depending upon the physiological nature of the iron-sulphur cluster(s). One of these subgroups are the 4Fe-4S ferredoxins, which are found in bacteria and which are thus often referred as ‘bacterial-type’ ferredoxins. The structure of these proteins [PUBMED:3129571] consists of the duplication of a domain of twenty six amino acid residues; each of these domains contains four cysteine residues that bind to a 4Fe-4S centre. Several structures of the 4Fe-4S ferredoxin domain have been determined [PUBMED:7966291]. The clusters consist of two interleaved 4Fe- and 4S-tetrahedra forming a cubane-like structure, in such a way that the four iron occupy the eight corners of a distorted cube. Each 4Fe-4S is attached to the polypeptide chain by four covalent Fe-S bonds involving cysteine residues. A number of proteins have been found [PUBMED:2185975] that include one or more 4Fe-4S binding domains similar to those of bacterial-type ferredoxins. The pattern of cysteine residues in the iron-sulphur region is sufficient to detect this class of 4Fe-4S binding proteins. This entry represents the whole domain. Note:In some bacterial ferredoxins, one of the two duplicated domains has lost one or more of the four conserved cysteines. The consequence of such variations is that these domains have either lost their iron-sulphur binding property or bind to a 3Fe-3S centre instead of a 4Fe-4S centre.
  5. LAGLIDADG-like domain
    Homing endonucleases (HEnases) form a large and highly diverse class of proteins encoded by introns and inteins that confer mobility to their host genetic elements. LAGLIDADG HEnases are structured into two tandemly repeated homing endonuclease-like domains [PUBMED:17603302, PUBMED:8918801]. This entry represents the homing endonuclease LAGLIDADG domain [PUBMED:9358175]. These endonucleases have been shown to occur in different environments: LAGLIDADG endonucleases are found in inteins, archaeal and group I introns and as free standing open reading frames (ORFs); HNH endonucleases occur in group I and group II introns and as ORFs. Phylogenetic analysis of the two families indicates a lack of exchange of endonucleases between different mobile elements (environments) and between hosts from different phylogenetic kingdoms. However, there does appear to have been considerable exchange of endonuclease domains amongst elements of the same type. Such events are suggested to be important for the formation of elements of new specificity [PMID: 9358175]. These fast evolving enzymes catalyze site specific, double-stranded breaks in intron/intein-less alleles. In the strand exchange during the repair process, the intron/intein encoding homing endonuclease is incorporated into the previously intron/intein-free allele, thus promoting the survival of this selfish genetic element.[2] Homing endonucleases can be divided into several distinct groups, including LAGLIDADG, GIY-YIG, HNH and His-Cys box families.[3] LAGLIDADG HEnases are one of the largest families of homing enzymes present in all kingdoms of life.[1] A reason behind the wide distribution of LAGLIDADG HEnases is their apparent ability to invade unrelated types of intervening sequences such as group I introns, archaeal introns and inteins.
  6. Bacterial transferase hexapeptide (six repeats)
    A variety of bacterial transferases contain a repeat structure composed of tandem repeats of a [LIV]-G-X(4) hexapeptide, which, in the tertiary structure of LpxA (UDP N-acetylglucosamine acyltransferase) [PUBMED:7481807], has been shown to form a left-handed parallel beta helix. A number of different transferase protein families contain this repeat, such as
    galactoside acetyltransferase-like proteins [PUBMED:11937062]: The galactoside acetyltransferase (thiogalactoside transacetylase) of Escherichia coli (GAT, LacA, EC 2.3.1.18) is a gene product of the classical lac operon. GAT may assist cellular detoxification by acetylating nonmetabolizable pyranosides, thereby preventing their reentry into the cell.
    the gamma-class of carbonic anhydrases [PUBMED:10924115]: Carbonic anhydrases (CA: EC:4.2.1.1) are zinc metalloenzymes which catalyse the reversible hydration of carbon dioxide to bicarbonate [PMID: 18336305, PMID: 10978542].
    and tetrahydrodipicolinate-N-succinyltransferases (DapD), the latter containing an extra N-terminal 3-helical domain [PUBMED:11910040]: Tetrahydrodipicolinate N-succinyltransferase (DapD) catalyzes the succinyl-CoA-dependent acylation of L-2-amino-6-oxopimelate to 2-N-succinyl-6-oxopimelate as part of the succinylase branch of the meso-diaminopimelate/lysine biosynthetic pathway of bacteria, blue-green algae, and plants. This pathway provides meso-diaminopimelate as a building block for cell wall peptidoglycan in most bacteria, and is regarded as a target pathway for antibacterial agents.
  7. TFIIB zinc-binding
    The transcription factor TFIIB contains a zinc-binding motif near the N-terminus. This domain is involved in the interaction with RNA pol II and TFIIF and plays a crucial role in selecting the transcription initiation site. The domain adopts a zinc ribbon like structure [1].
  8. 4Fe-4S dicluster domain
    This family includes proteins containing domains which bind to iron-sulfur clusters. Members include bacterial ferredoxins, various dehydrogenases, and various reductases. The structure of the domain is an alpha-antiparallel beta sandwich.
    Ferredoxins are a group of iron-sulphur proteins which mediate electron transfer in a wide variety of metabolic reactions. Ferredoxins can be divided into several subgroups depending upon the physiological nature of the iron-sulphur cluster(s). One of these subgroups are the 4Fe-4S ferredoxins, which are found in bacteria and which are thus often referred as ‘bacterial-type’ ferredoxins. The structure of these proteins [PUBMED:3129571] consists of the duplication of a domain of twenty six amino acid residues; each of these domains contains four cysteine residues that bind to a 4Fe-4S centre. Several structures of the 4Fe-4S ferredoxin domain have been determined [PUBMED:7966291]. The clusters consist of two interleaved 4Fe- and 4S-tetrahedra forming a cubane-like structure, in such a way that the four iron occupy the eight corners of a distorted cube. Each 4Fe-4S is attached to the polypeptide chain by four covalent Fe-S bonds involving cysteine residues. A number of proteins have been found [PUBMED:2185975] that include one or more 4Fe-4S binding domains similar to those of bacterial-type ferredoxins. The pattern of cysteine residues in the iron-sulphur region is sufficient to detect this class of 4Fe-4S binding proteins. This entry represents the whole domain. Note:In some bacterial ferredoxins, one of the two duplicated domains has lost one or more of the four conserved cysteines. The consequence of such variations is that these domains have either lost their iron-sulphur binding property or bind to a 3Fe-3S centre instead of a 4Fe-4S centre.
  9. Rad50 zinc hook motif
    The Mre11 complex (Mre11 Rad50 Nbs1) is central to chromosomal maintenance and functions in homologous recombination, telomere maintenance and sister chromatid association. The Rad50 coiled-coil region contains a dimer interface at the apex of the coiled coils in which pairs of conserved Cys-X-X-Cys motifs form interlocking hooks that bind one Zn ion. This alignment includes the zinc hook motif and a short stretch of coiled-coil on either side.
  10. Ribosomal protein L6
    Ribosomes are the particles that catalyse mRNA-directed protein synthesis in all organisms. The codons of the mRNA are exposed on the ribosome to allow tRNA binding. This leads to the incorporation of amino acids into the growing polypeptide chain in accordance with the genetic information. Incoming amino acid monomers enter the ribosomal A site in the form of aminoacyl-tRNAs complexed with elongation factor Tu (EF-Tu) and GTP. The growing polypeptide chain, situated in the P site as peptidyl-tRNA, is then transferred to aminoacyl-tRNA and the new peptidyl-tRNA, extended by one residue, is translocated to the P site with the aid the elongation factor G (EF-G) and GTP as the deacylated tRNA is released from the ribosome through one or more exit sites [PUBMED:11297922, PUBMED:11290319]. About 2/3 of the mass of the ribosome consists of RNA and 1/3 of protein. The proteins are named in accordance with the subunit of the ribosome which they belong to - the small (S1 to S31) and the large (L1 to L44). Usually they decorate the rRNA cores of the subunits. Many ribosomal proteins, particularly those of the large subunit, are composed of a globular, surfaced-exposed domain with long finger-like projections that extend into the rRNA core to stabilise its structure. Most of the proteins interact with multiple RNA elements, often from different domains. In the large subunit, about 1/3 of the 23S rRNA nucleotides are at least in van der Waal’s contact with protein, and L22 interacts with all six domains of the 23S rRNA. Proteins S4 and S7, which initiate assembly of the 16S rRNA, are located at junctions of five and four RNA helices, respectively. In this way proteins serve to organise and stabilise the rRNA tertiary structure. While the crucial activities of decoding and peptide transfer are RNA based, proteins play an active role in functions that may have evolved to streamline the process of protein synthesis. 
In addition to their function in the ribosome, many ribosomal proteins have some function ‘outside’ the ribosome [PUBMED:11290319, PUBMED:11114498].
    L6 is a protein from the large (50S) subunit. In Escherichia coli, it is located in the aminoacyl-tRNA binding site of the peptidyltransferase centre, and is known to bind directly to 23S rRNA. It belongs to a family of ribosomal proteins, including L6 from bacteria, cyanelles (structures that perform similar functions to chloroplasts, but have structural and biochemical characteristics of Cyanobacteria) and mitochondria; and L9 from mammals, Drosophila, plants and yeast. L6 contains two domains with almost identical folds, suggesting that it was derived by the duplication of an ancient RNA-binding protein gene. Analysis reveals several sites on the protein surface where interactions with other ribosome components may occur, the N terminus being involved in protein-protein interactions and the C terminus containing possible RNA-binding sites [PUBMED:8262035]. This entry represents the alpha-beta domain found duplicated in ribosomal L6 proteins. This domain consists of two beta-sheets and one alpha-helix packed around a single core [PUBMED:8262035].

SUMMARY:
Many of these families belong to proteins that are part of the reproductive machinery or that perform housekeeping functions such as transcription.

Top-10 Viral PFAM-Families

# Select the viral rows, then keep the TOP families with the highest count.
top_virus <- subset(d_superkingdom, Superkingdom == "Viruses")
# head() instead of [1:TOP]: with fewer than TOP rows, 1:TOP would index past
# the end and add all-NA rows. The outer parentheses print the result.
(top_virus <- top_virus[head(order(top_virus$count, decreasing = TRUE), TOP), ])
##     Superkingdom   PFAM_Name
## 62       Viruses     zf-CCHC
## 168      Viruses     Gag_p17
## 49       Viruses         RVP
## 16       Viruses         Ank
## 169      Viruses  Adeno_knob
## 193      Viruses Adeno_shaft
## 229      Viruses         rve
## 221      Viruses         BTB
## 46       Viruses     RNase_H
## 51       Viruses       Sushi
##                                          PFAM_desc   model
## 62                                    Zinc knuckle PF00098
## 168          gag gene protein p17 (matrix protein) PF00540
## 49                    Retroviral aspartyl protease PF00077
## 16                                  Ankyrin repeat PF00023
## 169         Adenoviral fibre protein (knob domain) PF00541
## 193 Adenoviral fibre protein (repeat/shaft region) PF00608
## 229                          Integrase core domain PF00665
## 221                                 BTB/POZ domain PF00651
## 46                                         RNase H PF00075
## 51                       Sushi repeat (SCR repeat) PF00084
##     is_chloroplastic is_mitochondrial count
## 62             FALSE            FALSE    56
## 168            FALSE            FALSE    37
## 49             FALSE            FALSE    13
## 16             FALSE            FALSE    11
## 169            FALSE            FALSE    11
## 193            FALSE            FALSE    11
## 229            FALSE            FALSE    11
## 221            FALSE            FALSE    10
## 46             FALSE            FALSE     9
## 51             FALSE            FALSE     9
  1. Zinc knuckle
    The zinc knuckle is a zinc binding motif composed of the following CX2CX4HX4C where X can be any amino acid. The motifs are mostly from retroviral gag proteins (nucleocapsid). Prototype structure is from HIV. Also contains members involved in eukaryotic gene regulation, such as C. elegans GLH-1 (RNA Helicase, regulate the formation of the granular structure of P-granules in embryos). Structure is an 18-residue zinc finger.
  2. gag gene protein p17 (matrix protein)
    Retroviral matrix proteins (or major core proteins) are components of envelope-associated capsids, which line the inner surface of virus envelopes and are associated with viral membranes [PUBMED:9657938]. Matrix proteins are produced as part of Gag precursor polyproteins. During viral maturation, the Gag polyprotein is cleaved into major structural proteins by the viral protease, yielding the matrix (MA), capsid (CA), nucleocapsid (NC), and some smaller peptides. Gag-derived proteins govern the entire assembly and release of the virus particles, with matrix proteins playing key roles in Gag stability, capsid assembly, transport and budding. Although matrix proteins from different retroviruses appear to perform similar functions and can have similar structural folds, their primary sequences can be very different. This entry represents matrix proteins from immunodeficiency lentiviruses, such as Human immunodeficiency virus (HIV) and Simian immunodeficiency virus (SIV-cpz) [PUBMED:12465460]. The structure of the HIV protein consists of 5 alpha helices, a short 3.10 helix and a 3-stranded mixed beta-sheet [PUBMED:7966331].
  3. Retroviral aspartyl protease
    Single domain aspartyl proteases from retroviruses, retrotransposons, and badnaviruses (plant dsDNA viruses). These proteases are generally part of a larger polyprotein; usually pol, more rarely gag. Retroviral proteases appear to be homologous to a single domain of the two-domain eukaryotic aspartyl proteases such as pepsins, cathepsins, and renins (PF00026).
  4. Adenoviral fibre protein (knob domain)
    Specific attachment of adenovirus is achieved through interactions between host-cell receptors and the adenovirus fibre protein and is mediated by the globular carboxy-terminal domain of the adenovirus fibre protein, termed the carboxy-terminal knob domain.
  5. Adenoviral fibre protein (repeat/shaft region)
    There is no separation between signal and noise. Specific attachment of adenovirus is achieved through interactions between host-cell receptors and the adenovirus fibre protein and is mediated by the globular carboxy-terminal domain of the adenovirus fibre protein, rather than the ‘shaft’ region represented by this family. The alignment of this family contains two copies of a fifteen residue repeat found in the ‘shaft’ region of adenoviral fibre proteins.
  6. Ankyrin repeat
    The ankyrin repeat is one of the most common protein-protein interaction motifs in nature. Ankyrin repeats are tandemly repeated modules of about 33 amino acids. They occur in a large number of functionally diverse proteins mainly from eukaryotes. The few known examples from prokaryotes and viruses may be the result of horizontal gene transfers [PUBMED:8108379]. The repeat has been found in proteins of diverse function such as transcriptional initiators, cell-cycle regulators, cytoskeletal, ion transporters and signal transducers. The ankyrin fold appears to be defined by its structure rather than its function since there is no specific sequence or structure which is universally recognised by it.
  7. Integrase core domain
    Retroviral integrase (IN) is an enzyme produced by a retrovirus (such as HIV) that enables its genetic material to be integrated into the DNA of the infected cell. Retroviral INs are not to be confused with phage integrases, such as λ phage integrase (Int) (see site-specific recombination). IN is a key component in the retroviral pre-integration complex (PIC). The complex of integrase bound to cognate viral DNA (vDNA) ends has been referred to as the intasome.[1]
    Integrase mediates integration of a DNA copy of the viral genome into the host chromosome. Integrase is composed of three domains. The amino-terminal domain is a zinc binding domain PF02022. This domain is the central catalytic domain. The carboxyl terminal domain is a non-specific DNA binding domain PF00552. The catalytic domain acts as an endonuclease when two nucleotides are removed from the 3’ ends of the blunt-ended viral DNA made by reverse transcription. This domain also catalyses the DNA strand transfer reaction of the 3’ ends of the viral DNA to the 5’ ends of the integration site [1].
  8. BTB/POZ domain
    The BTB (for BR-C, ttk and bab) [1] or POZ (for Pox virus and Zinc finger) [2] domain is present near the N-terminus of a fraction of zinc finger (PF00096) proteins and in proteins that contain the PF01344 motif such as Kelch and a family of pox virus proteins. The BTB/POZ domain mediates homomeric dimerisation and in some instances heteromeric dimerisation [2]. The structure of the dimerised PLZF BTB/POZ domain has been solved and consists of a tightly intertwined homodimer. The central scaffolding of the protein is made up of a cluster of alpha-helices flanked by short beta-sheets at both the top and bottom of the molecule [3]. POZ domains from several zinc finger proteins have been shown to mediate transcriptional repression and to interact with components of histone deacetylase co-repressor complexes including N-CoR and SMRT [4,5,6]. The POZ or BTB domain is also known as BR-C/Ttk or ZiN.
  9. RNase H
    RNase H digests the RNA strand of an RNA/DNA hybrid. Important enzyme in retroviral replication cycle, and often found as a domain associated with reverse transcriptases. Structure is a mixed alpha+beta fold with three a/b/a layers.
  10. Sushi repeat (SCR repeat)
    Sushi domain is an evolutionarily conserved protein domain. Sushi domains, also known as Complement control protein (CCP) modules, or short consensus repeats (SCR), exist in a wide variety of complement and adhesion proteins. The structure is known for this domain; it is based on a beta-sandwich arrangement - one face made up of three β-strands hydrogen-bonded to form a triple-stranded region at its centre, and the other face formed from two separate β-strands.[1]
    CD21 (also called C3d receptor, CR2, Epstein Barr virus receptor or EBV-R) is the receptor for EBV and for C3d, C3dg and iC3b. Complement components may activate B cells through CD21. CD21 is part of a large signal-transduction complex that also involves CD19, CD81, and Leu13.
    Some of the proteins in this group are responsible for the molecular basis of the blood group antigens, surface markers on the outside of the red blood cell membrane. Most of these markers are proteins, but some are carbohydrates attached to lipids or proteins.[2] Complement decay-accelerating factor (Antigen CD55) belongs to the Cromer blood group system and is associated with Cr(a), Dr(a), Es(a), Tc(a/b/c), Wd(a), WES(a/b), IFC and UMC antigens. Complement receptor type 1 (C3b/C4b receptor) (Antigen CD35) belongs to the Knops blood group system and is associated with Kn(a/b), McC(a), Sl(a) and Yk(a) antigens.

SUMMARY:
- Proteins for transcription (helicase, RNase), capsid formation (surface markers), reproduction (attachment to host-cell-receptors, integrase, surface markers)

Top-10 Eukaryotic PFAM-Families

# Select the eukaryotic rows, then keep the TOP families with the highest count.
top_eukaryota <- subset(d_superkingdom, Superkingdom == "Eukaryota")
# head() instead of [1:TOP]: with fewer than TOP rows, 1:TOP would index past
# the end and add all-NA rows. The outer parentheses print the result.
(top_eukaryota <-
  top_eukaryota[head(order(top_eukaryota$count, decreasing = TRUE), TOP), ])
##     Superkingdom PFAM_Name
## 126    Eukaryota      WD40
## 59     Eukaryota   zf-C2H2
## 874    Eukaryota     LRR_8
## 855    Eukaryota EF-hand_7
## 47     Eukaryota     RRM_1
## 133    Eukaryota       LIM
## 352    Eukaryota       PPR
## 826    Eukaryota     PPR_2
## 155    Eukaryota     TPR_1
## 314    Eukaryota  Collagen
##                                                   PFAM_desc   model
## 126                                WD domain, G-beta repeat PF00400
## 59                                   Zinc finger, C2H2 type PF00096
## 874                                     Leucine rich repeat PF13855
## 855                                     EF-hand domain pair PF13499
## 47  RNA recognition motif. (a.k.a. RRM, RBD, or RNP domain) PF00076
## 133                                              LIM domain PF00412
## 352                                              PPR repeat PF01535
## 826                                       PPR repeat family PF13041
## 155                                Tetratricopeptide repeat PF00515
## 314                Collagen triple helix repeat (20 copies) PF01391
##     is_chloroplastic is_mitochondrial count
## 126            FALSE            FALSE  1449
## 59             FALSE            FALSE   828
## 874            FALSE            FALSE   587
## 855            FALSE            FALSE   520
## 47             FALSE            FALSE   413
## 133            FALSE            FALSE   260
## 352            FALSE            FALSE   226
## 826            FALSE            FALSE   225
## 155            FALSE            FALSE   184
## 314            FALSE            FALSE   181
  1. WD domain, G-beta repeat
    WD-40 repeats (also known as WD or beta-transducin repeats) are short ~40 amino acid motifs, often terminating in a Trp-Asp (W-D) dipeptide. WD40 repeats usually assume a 7-8 bladed beta-propeller fold, but proteins have been found with 4 to 16 repeated units, which also form a circularised beta-propeller structure. WD-repeat proteins are a large family found in all eukaryotes and are implicated in a variety of functions ranging from signal transduction and transcription regulation to cell cycle control and apoptosis. Repeated WD40 motifs act as a site for protein-protein interaction, and proteins containing WD40 repeats are known to serve as platforms for the assembly of protein complexes or mediators of transient interplay among other proteins. The specificity of the proteins is determined by the sequences outside the repeats themselves. Examples of such complexes are G proteins (beta subunit is a beta-propeller), TAFII transcription factor, and E3 ubiquitin ligase [PUBMED:11814058, PUBMED:10322433]. In Arabidopsis spp., several WD40-containing proteins act as key regulators of plant-specific developmental events. One group of WD40 repeats is involved in RNA-processing complexes, some members forming part of snRNP particles6. Another group makes up parts of transcriptional regulators, including the TFIID subunit of the TATA-box-binding complex7, 8, 9. Others play roles in cytoskeleton assembly and mitotic-spindle formation10, 11, regulate vesicle formation and vesicular trafficking12, control various aspects of cell division13 or regulate sulfur metabolism in fungi14. Examples of such complexes are G proteins (beta subunit is a beta-propeller), TAFII transcription factor, and E3 ubiquitin ligase.[3][4]
  2. Zinc finger, C2H2 type
    The C2H2 zinc finger is the classical zinc finger domain. The two conserved cysteines and histidines co-ordinate a zinc ion. The following pattern describes the zinc finger. #-X-C-X(1-5)-C-X3-#-X5-#-X2-H-X(3-6)-[H/C] Where X can be any amino acid, and numbers in brackets indicate the number of residues. The positions marked # are those that are important for the stable fold of the zinc finger. The final position can be either his or cys. The C2H2 zinc finger is composed of two short beta strands followed by an alpha helix. The amino terminal part of the helix binds the major groove in DNA binding zinc fingers. The accepted consensus binding sequence for Sp1 is usually defined by the asymmetric hexanucleotide core GGGCGG but this sequence does not include, among others, the GAG (=CTC) repeat that constitutes a high-affinity site for Sp1 binding to the wt1 promoter [2].
    A zinc finger is a small protein structural motif that is characterized by the coordination of one or more zinc ions (Zn2+) in order to stabilize the fold. Originally coined to describe the finger-like appearance of a hypothesized structure from Xenopus laevis transcription factor IIIA, the zinc finger name has now come to encompass a wide variety of differing protein structures.[1] Xenopus laevis TFIIIA was originally demonstrated to contain zinc and require the metal for function in 1983, the first such reported zinc requirement for a gene regulatory protein.[2][3] Proteins that contain zinc fingers (zinc finger proteins) are classified into several different structural families. Unlike many other clearly defined supersecondary structures such as Greek keys or β hairpins, there are a number of types of zinc fingers, each with a unique three-dimensional architecture. A particular zinc finger protein’s class is determined by this three-dimensional structure, but it can also be recognized based on the primary structure of the protein or the identity of the ligands coordinating the zinc ion. In spite of the large variety of these proteins, however, the vast majority typically function as interaction modules that bind DNA, RNA, proteins, or other small, useful molecules, and variations in structure serve primarily to alter the binding specificity of a particular protein. Since their original discovery and the elucidation of their structure, these interaction modules have proven ubiquitous in the biological world and may be found in 3% of the genes of the human genome.[4] In addition, zinc fingers have become extremely useful in various therapeutic and research capacities. Engineering zinc fingers to have an affinity for a specific sequence is an area of active research, and zinc finger nucleases and zinc finger transcription factors are two of the most important applications of this to be realized to date.
  3. Leucine rich repeat
    A leucine-rich repeat (LRR) is a protein structural motif that forms an α/β horseshoe fold.[1][2] It is composed of repeating 20–30 amino acid stretches that are unusually rich in the hydrophobic amino acid leucine. These repeats commonly fold together to form a solenoid protein domain, termed leucine-rich repeat domain. Typically, each repeat unit has beta strand-turn-alpha helix structure, and the assembled domain, composed of many such repeats, has a horseshoe shape with an interior parallel beta sheet and an exterior array of helices. One face of the beta sheet and one side of the helix array are exposed to solvent and are therefore dominated by hydrophilic residues. The region between the helices and sheets is the protein’s hydrophobic core and is tightly sterically packed with leucine residues. Leucine-rich repeats are frequently involved in the formation of protein–protein interactions.
    Leucine-rich repeats (LRR) consist of 2-45 motifs of 20-30 amino acids in length that generally fold into an arc or horseshoe shape [PUBMED:14747988]. LRRs occur in proteins ranging from viruses to eukaryotes, and appear to provide a structural framework for the formation of protein-protein interactions [PUBMED:11751054, PUBMED:1657640]. Proteins containing LRRs include tyrosine kinase receptors, cell-adhesion molecules, virulence factors, and extracellular matrix-binding glycoproteins, and are involved in a variety of biological processes, including signal transduction, cell adhesion, DNA repair, recombination, transcription, RNA processing, disease resistance, apoptosis, and the immune response [PUBMED:2176636].
    Sequence analyses of LRR proteins suggested the existence of several different subfamilies of LRRs. The significance of this classification is that repeats from different subfamilies never occur simultaneously and have most probably evolved independently. It is, however, now clear that all major classes of LRR have curved horseshoe structures with a parallel beta sheet on the concave side and mostly helical elements on the convex side. At least six families of LRR proteins, characterised by different lengths and consensus sequences of the repeats, have been identified. Eleven-residue segments of the LRRs (LxxLxLxxN/CxL), corresponding to the beta-strand and adjacent loop regions, are conserved in LRR proteins, whereas the remaining parts of the repeats (herein termed variable) may be very different. Despite the differences, each of the variable parts contains two half-turns at both ends and a “linear” segment (as the chain follows a linear path overall), usually formed by a helix, in the middle. The concave face and the adjacent loops are the most common protein interaction surfaces on LRR proteins. 3D structure of some LRR proteins-ligand complexes show that the concave surface of LRR domain is ideal for interaction with alpha-helix, thus supporting earlier conclusions that the elongated and curved LRR structure provides an outstanding framework for achieving diverse protein-protein interactions [PUBMED:11751054]. Molecular modeling suggests that the conserved pattern LxxLxL, which is shorter than the previously proposed LxxLxLxxN/CxL is sufficient to impart the characteristic horseshoe curvature to proteins with 20- to 30-residue repeats [PUBMED:11967365].
    Leucine-rich repeat motifs have been identified in a large number of functionally unrelated proteins.[5] The best-known example is the ribonuclease inhibitor, but other proteins such as the tropomyosin regulator tropomodulin and the toll-like receptor also share the motif. In fact, the toll-like receptor possesses 10 successive LRR motifs which serve to bind pathogen- and danger-associated molecular patterns. Although the canonical LRR protein contains approximately one helix for every beta strand, variants that form beta-alpha superhelix folds sometimes have long loops rather than helices linking successive beta strands. One leucine-rich repeat variant domain (LRV) has a novel repetitive structural motif consisting of alternating alpha- and 310-helices arranged in a right-handed superhelix, with the absence of the beta-sheets present in other leucine-rich repeats.[6]
  4. EF-hand domain pair
    The EF hand is a helix-loop-helix structural domain or motif found in a large family of calcium-binding proteins. The EF-hand motif contains a helix-loop-helix topology, much like the spread thumb and forefinger of the human hand, in which the Ca2+ ions are coordinated by ligands within the loop. The motif takes its name from traditional nomenclature used in describing the protein parvalbumin, which contains three such motifs and is probably involved in muscle relaxation via its calcium-binding activity. The EF-hand consists of two alpha helices linked by a short loop region (usually about 12 amino acids) that usually binds calcium ions. EF-hands also appear in each structural domain of the signaling protein calmodulin and in the muscle protein troponin-C. Example: Aequorin is a calcium binding protein (CaBP) isolated from the coelenterate Aequorea victoria. Aequorin belongs to the EF-hand family of CaBPs, with EF-hand loops that are closely related to CaBPs in mammals. In addition, aequorin has been used for years as an indicator of Ca2+ and has been shown to be safe and well tolerated by cells. Aequorin is made up of two components – the calcium binding component apoaequorin (AQ) and the chemiluminescent molecule coelenterazine. The AQ portion of this protein contains the EF-hand calcium binding domains.[2]
  5. RNA recognition motif. (a.k.a. RRM, RBD, or RNP domain)
    RNA recognition motif, RNP-1 is a putative RNA-binding domain of about 90 amino acids that are known to bind single-stranded RNAs. It was found in many eukaryotic proteins.[1][2][3] The largest group of single strand RNA-binding protein is the eukaryotic RNA recognition motif (RRM) family that contains an eight amino acid RNP-1 consensus sequence.[4][5] RRM proteins have a variety of RNA binding preferences and functions, and include heterogeneous nuclear ribonucleoproteins (hnRNPs), proteins implicated in regulation of alternative splicing (SR, U2AF2, Sxl), protein components of small nuclear ribonucleoproteins (U1 and U2 snRNPs), and proteins that regulate RNA stability and translation (PABP, La, Hu).[2][3][5] The RRM in heterodimeric splicing factor U2 snRNP auxiliary factor appears to have two RRM-like domains with specialised features for protein recognition.[6] The motif also appears in a few single stranded DNA binding proteins. The typical RRM consists of four anti-parallel beta-strands and two alpha-helices arranged in a beta-alpha-beta-beta-alpha-beta fold with side chains that stack with RNA bases. A third helix is present during RNA binding in some cases.[7] The RRM is reviewed in a number of publications.[8][9][10]
    The RRM motif is probably diagnostic of an RNA binding protein. RRMs are found in a variety of RNA binding proteins, including various hnRNP proteins, proteins implicated in regulation of alternative splicing, and protein components of snRNPs. The motif also appears in a few single stranded DNA binding proteins. The RRM structure consists of four strands and two helices arranged in an alpha/beta sandwich, with a third helix present during RNA binding in some cases. The C-terminal beta strand (4th strand) and final helix are hard to align and have been omitted in the SEED alignment. The LA proteins (P05455) have an N-terminal RRM which is included in the seed. There is a second region towards the C terminus that has some features characteristic of an RRM but does not appear to have the important structural core of an RRM. The LA proteins (P05455) are one of the main autoantigens in Systemic lupus erythematosus (SLE), an autoimmune disease.
  6. PPR repeat family
    This repeat has no known function. It is about 35 amino acids long and is found in up to 18 copies in some proteins. The family appears to be greatly expanded in plants and fungi. The repeat has been called PPR [1].
  7. PPR repeat
    This entry represents the PPR repeat. Pentatricopeptide repeat (PPR) proteins are characterised by tandem repeats of a degenerate 35 amino acid motif [PUBMED:10664580]. Most of PPR proteins have roles in mitochondria or plastid [PUBMED:15270678]. PPR repeats were discovered while screening Arabidopsis proteins for those predicted to be targeted to mitochondria or chloroplast [PUBMED:10664580, PUBMED:15269332]. Some of these proteins have been shown to play a role in post-transcriptional processes within organelles and they are thought to be sequence-specific RNA-binding proteins [PUBMED:12782738, PUBMED:12832482, PUBMED:18031283]. Plant genomes have between one hundred to five hundred PPR genes per genome whereas non-plant genomes encode two to six PPR proteins. Although no PPR structures are yet known, the motif is predicted to fold into a helix-turn-helix structure similar to those found in the tetratricopeptide repeat (TPR) family (see PROSITEDOC) [PUBMED:10664580]. The plant PPR protein family has been divided in two subfamilies on the basis of their motif content and organisation [PUBMED:15269332, PUBMED:17560114]. Examples of PPR repeat-containing proteins include PET309 SWISSPROT, which may be involved in RNA stabilisation [PUBMED:7664742], and crp1, which is involved in RNA processing [PUBMED:8039510]. The repeat is associated with a predicted plant protein SWISSPROT that has a domain organisation similar to the human BRCA1 protein.
  8. LIM domain
    LIM domains are protein structural domains, composed of two contiguous zinc finger domains, separated by a two-amino acid residue hydrophobic linker.[1] They are named after their initial discovery in the proteins Lin11, Isl-1 & Mec-3.[2] LIM-domain containing proteins have been shown to play roles in cytoskeletal organisation, organ development and oncogenesis. LIM-domains mediate protein–protein interactions that are critical to cellular processes. LIM domains have highly divergent sequences, apart from certain key residues. The sequence divergence allows a great many different binding sites to be grafted onto the same basic domain. The conserved residues are those involved in zinc binding or the hydrophobic core of the protein. The sequence signature of LIM domains is as follows: [C]-[X]2–4-[C]-[X]13–19-[W]-[H]-[X]2–4-[C]-[F]-[LVI]-[C]-[X]2–4-[C]-[X]13–20-C-[X]2–4-[C]. LIM domains frequently occur in multiples, as seen in proteins such as TES and LMO4, and can also be attached to other domains in order to confer a binding or targeting function upon them, such as LIM-kinase. The LIM superclass of genes has been classified into 14 classes: ABLIM, CRP, ENIGMA, EPLIN, LASP, LHX, LMO, LIMK, LMO7, MICAL, PXN, PINCH, TES, and ZYX. Six of these classes (i.e., ABLIM, MICAL, ENIGMA, ZYX, LHX, LMO7) originated in the stem lineage of animals, and this expansion is thought to have made a major contribution to the origin of animal multicellularity.[3] LIM domains are also found in various bacterial lineages where they are typically fused to a metallopeptidase domain. Some versions show fusions to an inactive P-loop NTPase at their N-terminus and a single transmembrane helix. These domain fusions suggest that the prokaryotic LIM domains are likely to regulate protein processing at the cell membrane. The domain architectural syntax is remarkably parallel to that of the prokaryotic versions of the B-box zinc finger and the AN1 zinc finger domains.
  9. Tetratricopeptide repeat
    The tetratricopeptide repeat (TPR) is a structural motif present in a wide range of proteins [PUBMED:7667876, PUBMED:9482716, PUBMED:1882418]. It mediates protein-protein interactions and the assembly of multiprotein complexes [PUBMED:14659697]. The TPR motif consists of 3-16 tandem-repeats of 34 amino acid residues, although individual TPR motifs can be dispersed in the protein sequence. Sequence alignment of the TPR domains reveals a consensus sequence defined by a pattern of small and large amino acids. TPR motifs have been identified in various different organisms, ranging from bacteria to humans. Proteins containing TPRs are involved in a variety of biological processes, such as cell cycle regulation, transcriptional control, mitochondrial and peroxisomal protein transport, neurogenesis and protein folding. The X-ray structure of a domain containing three TPRs from protein phosphatase 5 revealed that TPR adopts a helix-turn-helix arrangement, with adjacent TPR motifs packing in a parallel fashion, resulting in a spiral of repeating anti-parallel alpha-helices [PUBMED:14659697]. The two helices are denoted helix A and helix B. The packing angle between helix A and helix B is ~24 degrees within a single TPR and generates a right-handed superhelical shape. Helix A interacts with helix B and with helix A’ of the next TPR. Two protein surfaces are generated: the inner concave surface is contributed mainly by residues on helix A, and the other surface presents residues from both helices A and B.
  10. Tetratricopeptide repeat
    The tetratricopeptide repeat (TPR) is a structural motif present in a wide range of proteins [PUBMED:7667876, PUBMED:9482716, PUBMED:1882418]. It mediates protein-protein interactions and the assembly of multiprotein complexes [PUBMED:14659697]. The TPR motif consists of 3-16 tandem-repeats of 34 amino acid residues, although individual TPR motifs can be dispersed in the protein sequence. Sequence alignment of the TPR domains reveals a consensus sequence defined by a pattern of small and large amino acids. TPR motifs have been identified in various different organisms, ranging from bacteria to humans. Proteins containing TPRs are involved in a variety of biological processes, such as cell cycle regulation, transcriptional control, mitochondrial and peroxisomal protein transport, neurogenesis and protein folding. The X-ray structure of a domain containing three TPRs from protein phosphatase 5 revealed that TPR adopts a helix-turn-helix arrangement, with adjacent TPR motifs packing in a parallel fashion, resulting in a spiral of repeating anti-parallel alpha-helices [PUBMED:14659697]. The two helices are denoted helix A and helix B. The packing angle between helix A and helix B is ~24 degrees within a single TPR and generates a right-handed superhelical shape. Helix A interacts with helix B and with helix A’ of the next TPR. Two protein surfaces are generated: the inner concave surface is contributed mainly by residues on helix A, and the other surface presents residues from both helices A and B.
  11. Collagen triple helix repeat (20 copies)
    In collagen, the collagen helix, or type-2 helix, is a major shape in secondary structure. It consists of a triple helix made of the repetitious amino acid sequence glycine - X - Y, where X and Y are frequently proline or hydroxyproline.[2][3] A collagen triple helix has 3.3 residues per turn.[4] Each of the three chains is stabilized by the steric repulsion due to the pyrrolidine rings of proline and hydroxyproline residues. The pyrrolidine rings keep out of each other’s way when the polypeptide chain assumes this extended helical form, which is much more open than the tightly coiled form of the alpha helix. The three chains are hydrogen bonded to each other. The hydrogen bond donors are the peptide NH groups of glycine residues. The hydrogen bond acceptors are the CO groups of residues on the other chains. The OH group of hydroxyproline also participates in hydrogen bonding. The rise of the collagen helix (superhelix) is 2.9 Å (0.29 nm) per residue.
    Members of this family belong to the collagen superfamily [1]. Collagens are generally extracellular structural proteins involved in formation of connective tissue structure. The alignment contains 20 copies of the G-X-Y repeat that forms a triple helix. The first position of the repeat is glycine, the second and third positions can be any residue but are frequently proline and hydroxy-proline. Collagens are post translationally modified by proline hydroxylase to form the hydroxy-proline residues. Defective hydroxylation is the cause of scurvy. Some members of the collagen superfamily are not involved in connective tissue structure but share the same triple helical structure. The family includes bacterial collagen-like triple-helix repeat proteins [2,3].
  12. Calcium-binding EGF domain
    In molecular biology, the calcium-binding EGF domain is a conserved domain of about forty amino-acid residues found in epidermal growth factor (EGF). This domain is present in a large number of membrane-bound and extracellular, mostly animal, proteins.[1][2][3][4][5] Many of these proteins require calcium for their biological function and a calcium-binding site has been found at the N-terminus of some EGF-like domains.[6] Calcium-binding may be crucial for numerous protein-protein interactions. For human coagulation factor IX it has been shown that the calcium-ligands form a pentagonal bipyramid.[7] The first, third and fourth conserved negatively charged or polar residues are side chain ligands. The latter is possibly hydroxylated.[6] A conserved aromatic residue, as well as the second conserved negative residue, are thought to be involved in stabilising the calcium-binding site. As in non-calcium binding EGF-like domains, there are six conserved cysteines and the structure of both types is very similar as calcium-binding induces only strictly local structural changes.[6]
    A sequence of about forty amino-acid residues found in epidermal growth factor (EGF) has been shown [PUBMED:2288911, PUBMED:6334307, PUBMED:3534958, PUBMED:6607417, PUBMED:3282918] to be present in a large number of membrane-bound and extracellular, mostly animal, proteins. Many of these proteins require calcium for their biological function and a calcium-binding site has been found at the N terminus of some EGF-like domains [PUBMED:1527084]. Calcium-binding may be crucial for numerous protein-protein interactions. For human coagulation factor IX it has been shown [PUBMED:7606779] that the calcium-ligands form a pentagonal bipyramid. The first, third and fourth conserved negatively charged or polar residues are side chain ligands. The latter is possibly hydroxylated (see aspartic acid and asparagine hydroxylation site) [PUBMED:1527084]. A conserved aromatic residue, as well as the second conserved negative residue, are thought to be involved in stabilising the calcium-binding site. As in non-calcium binding EGF-like domains, there are six conserved cysteines and the structure of both types is very similar as calcium-binding induces only strictly local structural changes [PUBMED:1527084].
  13. Leucine Rich Repeat
    A leucine-rich repeat (LRR) is a protein structural motif that forms an α/β horseshoe fold.[1][2] It is composed of repeating 20–30 amino acid stretches that are unusually rich in the hydrophobic amino acid leucine. These repeats commonly fold together to form a solenoid protein domain, termed leucine-rich repeat domain. Typically, each repeat unit has beta strand-turn-alpha helix structure, and the assembled domain, composed of many such repeats, has a horseshoe shape with an interior parallel beta sheet and an exterior array of helices. One face of the beta sheet and one side of the helix array are exposed to solvent and are therefore dominated by hydrophilic residues. The region between the helices and sheets is the protein’s hydrophobic core and is tightly sterically packed with leucine residues. Leucine-rich repeats are frequently involved in the formation of protein–protein interactions.
    Leucine-rich repeats (LRR) consist of 2-45 motifs of 20-30 amino acids in length that generally fold into an arc or horseshoe shape [PUBMED:14747988]. LRRs occur in proteins ranging from viruses to eukaryotes, and appear to provide a structural framework for the formation of protein-protein interactions [PUBMED:11751054, PUBMED:1657640]. Proteins containing LRRs include tyrosine kinase receptors, cell-adhesion molecules, virulence factors, and extracellular matrix-binding glycoproteins, and are involved in a variety of biological processes, including signal transduction, cell adhesion, DNA repair, recombination, transcription, RNA processing, disease resistance, apoptosis, and the immune response [PUBMED:2176636].
    Sequence analyses of LRR proteins suggested the existence of several different subfamilies of LRRs. The significance of this classification is that repeats from different subfamilies never occur simultaneously and have most probably evolved independently. It is, however, now clear that all major classes of LRR have curved horseshoe structures with a parallel beta sheet on the concave side and mostly helical elements on the convex side. At least six families of LRR proteins, characterised by different lengths and consensus sequences of the repeats, have been identified. Eleven-residue segments of the LRRs (LxxLxLxxN/CxL), corresponding to the beta-strand and adjacent loop regions, are conserved in LRR proteins, whereas the remaining parts of the repeats (herein termed variable) may be very different. Despite the differences, each of the variable parts contains two half-turns at both ends and a “linear” segment (as the chain follows a linear path overall), usually formed by a helix, in the middle. The concave face and the adjacent loops are the most common protein interaction surfaces on LRR proteins. 3D structures of some LRR protein-ligand complexes show that the concave surface of the LRR domain is ideal for interaction with an alpha-helix, thus supporting earlier conclusions that the elongated and curved LRR structure provides an outstanding framework for achieving diverse protein-protein interactions [PUBMED:11751054]. Molecular modeling suggests that the conserved pattern LxxLxL, which is shorter than the previously proposed LxxLxLxxN/CxL, is sufficient to impart the characteristic horseshoe curvature to proteins with 20- to 30-residue repeats [PUBMED:11967365].
    Leucine-rich repeat motifs have been identified in a large number of functionally unrelated proteins.[5] The best-known example is the ribonuclease inhibitor, but other proteins such as the tropomyosin regulator tropomodulin and the toll-like receptor also share the motif. In fact, the toll-like receptor possesses 10 successive LRR motifs which serve to bind pathogen- and danger-associated molecular patterns. Although the canonical LRR protein contains approximately one helix for every beta strand, variants that form beta-alpha superhelix folds sometimes have long loops rather than helices linking successive beta strands. One leucine-rich repeat variant domain (LRV) has a novel repetitive structural motif consisting of alternating alpha- and 3₁₀-helices arranged in a right-handed superhelix, with the absence of the beta-sheets present in other leucine-rich repeats.[6] CAUTION: This Pfam may not find all Leucine Rich Repeats in a protein. Leucine Rich Repeats are short sequence motifs present in a number of proteins with diverse functions and cellular locations. These repeats are usually involved in protein-protein interactions. Each Leucine Rich Repeat is composed of a beta-alpha unit. These units form elongated non-globular structures. Leucine Rich Repeats are often flanked by cysteine rich domains.
  14. PPR repeat
    This entry represents the PPR repeat. Pentatricopeptide repeat (PPR) proteins are characterised by tandem repeats of a degenerate 35 amino acid motif [PUBMED:10664580]. Most of PPR proteins have roles in mitochondria or plastid [PUBMED:15270678]. PPR repeats were discovered while screening Arabidopsis proteins for those predicted to be targeted to mitochondria or chloroplast [PUBMED:10664580, PUBMED:15269332]. Some of these proteins have been shown to play a role in post-transcriptional processes within organelles and they are thought to be sequence-specific RNA-binding proteins [PUBMED:12782738, PUBMED:12832482, PUBMED:18031283]. Plant genomes have between one hundred to five hundred PPR genes per genome whereas non-plant genomes encode two to six PPR proteins. Although no PPR structures are yet known, the motif is predicted to fold into a helix-turn-helix structure similar to those found in the tetratricopeptide repeat (TPR) family (see PROSITEDOC) [PUBMED:10664580]. The plant PPR protein family has been divided in two subfamilies on the basis of their motif content and organisation [PUBMED:15269332, PUBMED:17560114]. Examples of PPR repeat-containing proteins include PET309 SWISSPROT, which may be involved in RNA stabilisation [PUBMED:7664742], and crp1, which is involved in RNA processing [PUBMED:8039510]. The repeat is associated with a predicted plant protein SWISSPROT that has a domain organisation similar to the human BRCA1 protein.
  15. Kelch motif
    The Kelch motif is a region of protein sequence found widely in proteins from bacteria and eukaryotes.[2] This sequence motif is composed of about 50 amino acid residues which form a structure of a four stranded beta-sheet “blade”. This sequence motif is found in between five and eight copies per protein which fold together to form a larger circular solenoid structure called a beta-propeller domain. The known functions of kelch-containing proteins are diverse:
    scruin is an actin cross-linking protein; galactose oxidase catalyses the oxidation of the hydroxyl group at the C6 position in D-galactose; neuraminidase hydrolyses sialic acid residues from glycoproteins; NanM is a sialic acid mutarotase, involved in efficient utilisation of sialic acid by bacteria; kelch may have a cytoskeletal function, as it is localised to the actin-rich ring canals that connect the 15 nurse cells to the developing oocyte in Drosophila.[4]
  16. EGF-like domain
    The EGF-like domain is an evolutionarily conserved protein domain, which derives its name from the epidermal growth factor where it was first described. It comprises about 30 to 40 amino-acid residues and has been found in a large number of mostly animal proteins.[2][3] Most occurrences of the EGF-like domain are found in the extracellular domain of membrane-bound proteins or in proteins known to be secreted. An exception to this is the prostaglandin-endoperoxide synthase. The EGF-like domain includes 6 cysteine residues which in the epidermal growth factor have been shown to form 3 disulfide bonds. The structures of 4-disulfide EGF-domains have been solved from the laminin and integrin proteins. The main structure of EGF-like domains is a two-stranded β-sheet followed by a loop to a short C-terminal, two-stranded β-sheet. These two β-sheets are usually denoted as the major (N-terminal) and minor (C-terminal) sheets.[4] EGF-like domains frequently occur in numerous tandem copies in proteins: these repeats typically fold together to form a single, linear solenoid domain block as a functional unit. The domain has roles in the immune system, apoptosis, and Ca2+ binding.
  17. Armadillo/beta-catenin-like repeat
    The armadillo (Arm) repeat is an approximately 40 amino acid long tandemly repeated sequence motif first identified in the Drosophila melanogaster segment polarity gene armadillo involved in signal transduction through wingless. Animal Arm-repeat proteins function in various processes, including intracellular signalling and cytoskeletal regulation, and include such proteins as beta-catenin, the junctional plaque protein plakoglobin, the adenomatous polyposis coli (APC) tumour suppressor protein, and the nuclear transport factor importin-alpha, amongst others [PUBMED:9770300]. A subset of these proteins is conserved across eukaryotic kingdoms. In higher plants, some Arm-repeat proteins function in intracellular signalling like their mammalian counterparts, while others have novel functions [PUBMED:12946625]. The 3-dimensional fold of an armadillo repeat is known from the crystal structure of beta-catenin, where the 12 repeats form a superhelix of alpha helices with three helices per unit [PUBMED:9298899]. The cylindrical structure features a positively charged groove, which presumably interacts with the acidic surfaces of the known interaction partners of beta-catenin.
  18. Pentatricopeptide repeat domain
    This entry represents the PPR repeat. Pentatricopeptide repeat (PPR) proteins are characterised by tandem repeats of a degenerate 35 amino acid motif [PUBMED:10664580]. Most of PPR proteins have roles in mitochondria or plastid [PUBMED:15270678]. PPR repeats were discovered while screening Arabidopsis proteins for those predicted to be targeted to mitochondria or chloroplast [PUBMED:10664580, PUBMED:15269332]. Some of these proteins have been shown to play a role in post-transcriptional processes within organelles and they are thought to be sequence-specific RNA-binding proteins [PUBMED:12782738, PUBMED:12832482, PUBMED:18031283]. Plant genomes have between one hundred to five hundred PPR genes per genome whereas non-plant genomes encode two to six PPR proteins. Although no PPR structures are yet known, the motif is predicted to fold into a helix-turn-helix structure similar to those found in the tetratricopeptide repeat (TPR) family (see PROSITEDOC) [PUBMED:10664580]. The plant PPR protein family has been divided in two subfamilies on the basis of their motif content and organisation [PUBMED:15269332, PUBMED:17560114]. Examples of PPR repeat-containing proteins include PET309 SWISSPROT, which may be involved in RNA stabilisation [PUBMED:7664742], and crp1, which is involved in RNA processing [PUBMED:8039510]. The repeat is associated with a predicted plant protein SWISSPROT that has a domain organisation similar to the human BRCA1 protein.
    This family matches additional variants of the PPR repeat that were not captured by the model for PF01535 (PPR). In the case of the Arabidopsis protein UniProtKB:Q66GI4, the repeated helices in this N-terminal region of protein-only RNase P (PRORP) enzymes form the pentatricopeptide repeat (PPR) domain which enhances pre-tRNA binding affinity. PRORP enzymes process precursor tRNAs in human mitochondria and in all tRNA-using compartments of Arabidopsis thaliana [1].
  19. KRAB box
    The Krueppel-associated box (KRAB) is a domain of around 75 amino acids that is found in the N-terminal part of about one third of eukaryotic Krueppel-type C2H2 zinc finger proteins (ZFPs) [PUBMED:14519192]. It is enriched in charged amino acids and can be divided into subregions A and B, which are predicted to fold into two amphipathic alpha-helices. The KRAB A and B boxes can be separated by variable spacer segments and many KRAB proteins contain only the A box [PUBMED:2023909]. The functions currently known for members of the KRAB-containing protein family include transcriptional repression of RNA polymerase I, II and III promoters, binding and splicing of RNA, and control of nucleolus function. The KRAB domain functions as a transcriptional repressor when tethered to the template DNA by a DNA-binding domain. A sequence of 45 amino acids in the KRAB A subdomain has been shown to be necessary and sufficient for transcriptional repression. The B box does not repress by itself but does potentiate the repression exerted by the KRAB A subdomain [PUBMED:8183939, PUBMED:8183940]. Gene silencing requires the binding of the KRAB domain to the RING-B box-coiled coil (RBCC) domain of the KAP-1/TIF1-beta corepressor. As KAP-1 binds to the heterochromatin proteins HP1, it has been proposed that the KRAB-ZFP-bound target gene could be silenced following recruitment to heterochromatin [PUBMED:10653693, PUBMED:10748030]. KRAB-ZFPs probably constitute the single largest class of transcription factors within the human genome [PUBMED:10360839]. The KRAB domain is generally encoded by two exons. The regions coded by the two exons are known as KRAB-A and KRAB-B. Although the function of KRAB-ZFPs is largely unknown, they appear to play important roles during cell differentiation and development. These proteins have been shown to play important roles in cell differentiation and organ development, and in regulating viral replication and transcription. 
    A KRAB domain may consist of an A-box, or of an A-box plus either a B-box, a divergent B-box (b), or a C-box. Only the A-box is included in this model. The A-box is needed for repression, the B- and C- boxes are not. KRAB-ZFPs have one or two KRAB domains at their amino-terminal end, and multiple C2H2 zinc finger motifs at their C-termini. Some KRAB-ZFPs also contain a SCAN domain which mediates homo- and hetero-oligomerization. The KRAB domain is a protein-protein interaction module which represses transcription through recruiting corepressors. A key mechanism appears to be the following: KRAB-ZFPs tethered to DNA recruit, via their KRAB domain, the repressor KAP1 (KRAB-associated protein-1, also known as transcription intermediary factor 1 beta, KRAB-A interacting protein and tripartite motif protein 28). The KAP1/KRAB-ZFP complex in turn recruits the heterochromatin protein 1 (HP1) family, and other chromatin modulating proteins, leading to transcriptional repression through heterochromatin formation.
  20. Tetratricopeptide repeat
    The tetratricopeptide repeat (TPR) is a structural motif present in a wide range of proteins [PUBMED:7667876, PUBMED:9482716, PUBMED:1882418]. It mediates protein-protein interactions and the assembly of multiprotein complexes [PUBMED:14659697]. The TPR motif consists of 3-16 tandem-repeats of 34 amino acid residues, although individual TPR motifs can be dispersed in the protein sequence. Sequence alignment of the TPR domains reveals a consensus sequence defined by a pattern of small and large amino acids. TPR motifs have been identified in various different organisms, ranging from bacteria to humans. Proteins containing TPRs are involved in a variety of biological processes, such as cell cycle regulation, transcriptional control, mitochondrial and peroxisomal protein transport, neurogenesis and protein folding. The X-ray structure of a domain containing three TPRs from protein phosphatase 5 revealed that TPR adopts a helix-turn-helix arrangement, with adjacent TPR motifs packing in a parallel fashion, resulting in a spiral of repeating anti-parallel alpha-helices [PUBMED:14659697]. The two helices are denoted helix A and helix B. The packing angle between helix A and helix B is ~24 degrees within a single TPR and generates a right-handed superhelical shape. Helix A interacts with helix B and with helix A’ of the next TPR. Two protein surfaces are generated: the inner concave surface is contributed mainly by residues on helix A, and the other surface presents residues from both helices A and B.

SUMMARY:
- Transcription (Zn-finger for adhesion to DNA, RNA and lipids, RNA-binding through RNA recognition motif, assembly of multiprotein complexes (TPR), transcriptional repression, RNA-pol binding, and RNA-splicing by KRAB box)
- electron/ion-binding and transport -> muscle relaxation (EF-hand domain pair)
- Cytoskeletal organisation (KELCH), organ development and oncogenesis (LIM, collagen forming connective tissue)

Top-10 Viridiplantae PFAM-Families

# Select the Viridiplantae entries of d_kingdom that are flagged neither
# mitochondrial nor chloroplastic.
top_viridiplantae <- subset(
  d_kingdom,
  Superkingdom == "Eukaryota" & Kingdom == "Viridiplantae" &
    is_mitochondrial == FALSE & is_chloroplastic == FALSE
)
# Sort by count (descending) and keep at most TOP rows. head() stops at the
# number of available rows, whereas indexing with order(...)[1:TOP] pads the
# result with all-NA rows when fewer than TOP rows pass the filter.
# The outer parentheses print the resulting data frame.
(top_viridiplantae <- head(
  top_viridiplantae[order(top_viridiplantae$count, decreasing = TRUE), ],
  TOP
))
##      Superkingdom       Kingdom PFAM_Name
## 496     Eukaryota Viridiplantae       PPR
## 1078    Eukaryota Viridiplantae     PPR_2
## 1122    Eukaryota Viridiplantae EF-hand_7
## 186     Eukaryota Viridiplantae      WD40
## 1154    Eukaryota Viridiplantae     LRR_8
## 1064    Eukaryota Viridiplantae     PPR_1
## 266     Eukaryota Viridiplantae     LRR_1
## 885     Eukaryota Viridiplantae     TPR_2
## 1144    Eukaryota Viridiplantae     PPR_3
## 63      Eukaryota Viridiplantae     RRM_1
##                                                    PFAM_desc   model
## 496                                               PPR repeat PF01535
## 1078                                       PPR repeat family PF13041
## 1122                                     EF-hand domain pair PF13499
## 186                                 WD domain, G-beta repeat PF00400
## 1154                                     Leucine rich repeat PF13855
## 1064                                              PPR repeat PF12854
## 266                                      Leucine Rich Repeat PF00560
## 885                                 Tetratricopeptide repeat PF07719
## 1144                         Pentatricopeptide repeat domain PF13812
## 63   RNA recognition motif. (a.k.a. RRM, RBD, or RNP domain) PF00076
##      is_chloroplastic is_mitochondrial count
## 496             FALSE            FALSE   226
## 1078            FALSE            FALSE   225
## 1122            FALSE            FALSE   170
## 186             FALSE            FALSE   123
## 1154            FALSE            FALSE   117
## 1064            FALSE            FALSE    83
## 266             FALSE            FALSE    72
## 885             FALSE            FALSE    68
## 1144            FALSE            FALSE    54
## 63              FALSE            FALSE    50
  1. PPR repeat (PPR)
    This entry represents the PPR repeat. Pentatricopeptide repeat (PPR) proteins are characterised by tandem repeats of a degenerate 35 amino acid motif [PUBMED:10664580]. Most PPR proteins have roles in mitochondria or plastids [PUBMED:15270678]. PPR repeats were discovered while screening Arabidopsis proteins for those predicted to be targeted to mitochondria or chloroplast [PUBMED:10664580, PUBMED:15269332]. Some of these proteins have been shown to play a role in post-transcriptional processes within organelles and they are thought to be sequence-specific RNA-binding proteins [PUBMED:12782738, PUBMED:12832482, PUBMED:18031283]. Plant genomes have between one hundred and five hundred PPR genes per genome whereas non-plant genomes encode two to six PPR proteins. Although no PPR structures are yet known, the motif is predicted to fold into a helix-turn-helix structure similar to those found in the tetratricopeptide repeat (TPR) family (see PROSITEDOC) [PUBMED:10664580]. The plant PPR protein family has been divided into two subfamilies on the basis of their motif content and organisation [PUBMED:15269332, PUBMED:17560114]. Examples of PPR repeat-containing proteins include PET309 SWISSPROT, which may be involved in RNA stabilisation [PUBMED:7664742], and crp1, which is involved in RNA processing [PUBMED:8039510]. The repeat is associated with a predicted plant protein SWISSPROT that has a domain organisation similar to the human BRCA1 protein.
  2. PPR repeat family (PPR_2)
    This repeat has no known function. It is about 35 amino acids long and is found in up to 18 copies in some proteins. The family appears to be greatly expanded in plants and fungi. The repeat has been called PPR [1].
  3. EF-hand domain pair
    The EF hand is a helix-loop-helix structural domain or motif found in a large family of calcium-binding proteins. The EF-hand motif contains a helix-loop-helix topology, much like the spread thumb and forefinger of the human hand, in which the Ca2+ ions are coordinated by ligands within the loop. The motif takes its name from traditional nomenclature used in describing the protein parvalbumin, which contains three such motifs and is probably involved in muscle relaxation via its calcium-binding activity. The EF-hand consists of two alpha helices linked by a short loop region (usually about 12 amino acids) that usually binds calcium ions. EF-hands also appear in each structural domain of the signaling protein calmodulin and in the muscle protein troponin-C. Example: Aequorin is a calcium binding protein (CaBP) isolated from the coelenterate Aequorea victoria. Aequorin belongs to the EF-hand family of CaBPs, with EF-hand loops that are closely related to CaBPs in mammals. In addition, aequorin has been used for years as an indicator of Ca2+ and has been shown to be safe and well tolerated by cells. Aequorin is made up of two components – the calcium binding component apoaequorin (AQ) and the chemiluminescent molecule coelenterazine. The AQ portion of this protein contains the EF-hand calcium binding domains.[2]
  4. WD domain, G-beta repeat
    WD-40 repeats (also known as WD or beta-transducin repeats) are short ~40 amino acid motifs, often terminating in a Trp-Asp (W-D) dipeptide. WD40 repeats usually assume a 7-8 bladed beta-propeller fold, but proteins have been found with 4 to 16 repeated units, which also form a circularised beta-propeller structure. WD-repeat proteins are a large family found in all eukaryotes and are implicated in a variety of functions ranging from signal transduction and transcription regulation to cell cycle control and apoptosis. Repeated WD40 motifs act as a site for protein-protein interaction, and proteins containing WD40 repeats are known to serve as platforms for the assembly of protein complexes or mediators of transient interplay among other proteins. The specificity of the proteins is determined by the sequences outside the repeats themselves. Examples of such complexes are G proteins (beta subunit is a beta-propeller), TAFII transcription factor, and E3 ubiquitin ligase [PUBMED:11814058, PUBMED:10322433]. In Arabidopsis spp., several WD40-containing proteins act as key regulators of plant-specific developmental events. One group of WD40 repeats is involved in RNA-processing complexes, some members forming part of snRNP particles [6]. Another group makes up parts of transcriptional regulators, including the TFIID subunit of the TATA-box-binding complex [7, 8, 9]. Others play roles in cytoskeleton assembly and mitotic-spindle formation [10, 11], regulate vesicle formation and vesicular trafficking [12], control various aspects of cell division [13] or regulate sulfur metabolism in fungi [14]. Examples of such complexes are G proteins (beta subunit is a beta-propeller), TAFII transcription factor, and E3 ubiquitin ligase.[3][4]
  5. Leucine rich repeat (LRR_8)
    A leucine-rich repeat (LRR) is a protein structural motif that forms an α/β horseshoe fold.[1][2] It is composed of repeating 20–30 amino acid stretches that are unusually rich in the hydrophobic amino acid leucine. These repeats commonly fold together to form a solenoid protein domain, termed leucine-rich repeat domain. Typically, each repeat unit has beta strand-turn-alpha helix structure, and the assembled domain, composed of many such repeats, has a horseshoe shape with an interior parallel beta sheet and an exterior array of helices. One face of the beta sheet and one side of the helix array are exposed to solvent and are therefore dominated by hydrophilic residues. The region between the helices and sheets is the protein’s hydrophobic core and is tightly sterically packed with leucine residues. Leucine-rich repeats are frequently involved in the formation of protein–protein interactions.
    Leucine-rich repeats (LRR) consist of 2-45 motifs of 20-30 amino acids in length that generally fold into an arc or horseshoe shape [PUBMED:14747988]. LRRs occur in proteins ranging from viruses to eukaryotes, and appear to provide a structural framework for the formation of protein-protein interactions [PUBMED:11751054, PUBMED:1657640]. Proteins containing LRRs include tyrosine kinase receptors, cell-adhesion molecules, virulence factors, and extracellular matrix-binding glycoproteins, and are involved in a variety of biological processes, including signal transduction, cell adhesion, DNA repair, recombination, transcription, RNA processing, disease resistance, apoptosis, and the immune response [PUBMED:2176636].
    Sequence analyses of LRR proteins suggested the existence of several different subfamilies of LRRs. The significance of this classification is that repeats from different subfamilies never occur simultaneously and have most probably evolved independently. It is, however, now clear that all major classes of LRR have curved horseshoe structures with a parallel beta sheet on the concave side and mostly helical elements on the convex side. At least six families of LRR proteins, characterised by different lengths and consensus sequences of the repeats, have been identified. Eleven-residue segments of the LRRs (LxxLxLxxN/CxL), corresponding to the beta-strand and adjacent loop regions, are conserved in LRR proteins, whereas the remaining parts of the repeats (herein termed variable) may be very different. Despite the differences, each of the variable parts contains two half-turns at both ends and a “linear” segment (as the chain follows a linear path overall), usually formed by a helix, in the middle. The concave face and the adjacent loops are the most common protein interaction surfaces on LRR proteins. 3D structure of some LRR proteins-ligand complexes show that the concave surface of LRR domain is ideal for interaction with alpha-helix, thus supporting earlier conclusions that the elongated and curved LRR structure provides an outstanding framework for achieving diverse protein-protein interactions [PUBMED:11751054]. Molecular modeling suggests that the conserved pattern LxxLxL, which is shorter than the previously proposed LxxLxLxxN/CxL is sufficient to impart the characteristic horseshoe curvature to proteins with 20- to 30-residue repeats [PUBMED:11967365].
    Leucine-rich repeat motifs have been identified in a large number of functionally unrelated proteins.[5] The best-known example is the ribonuclease inhibitor, but other proteins such as the tropomyosin regulator tropomodulin and the toll-like receptor also share the motif. In fact, the toll-like receptor possesses 10 successive LRR motifs which serve to bind pathogen- and danger-associated molecular patterns. Although the canonical LRR protein contains approximately one helix for every beta strand, variants that form beta-alpha superhelix folds sometimes have long loops rather than helices linking successive beta strands. One leucine-rich repeat variant domain (LRV) has a novel repetitive structural motif consisting of alternating alpha- and 3₁₀-helices arranged in a right-handed superhelix, with the absence of the beta-sheets present in other leucine-rich repeats.[6]
  6. PPR repeat (PPR_1)
    This family matches additional variants of the PPR repeat that were not captured by the model for PF01535 (PPR). The exact function is not known.
    Pentatricopeptide repeat (PPR) proteins are characterised by tandem repeats of a degenerate 35 amino acid motif [PUBMED:10664580]. Most of PPR proteins have roles in mitochondria or plastid [PUBMED:15270678]. PPR repeats were discovered while screening Arabidopsis proteins for those predicted to be targeted to mitochondria or chloroplast [PUBMED:10664580, PUBMED:15269332]. Some of these proteins have been shown to play a role in post-transcriptional processes within organelles and they are thought to be sequence-specific RNA-binding proteins [PUBMED:12782738, PUBMED:12832482, PUBMED:18031283]. Plant genomes have between one hundred to five hundred PPR genes per genome whereas non-plant genomes encode two to six PPR proteins. Although no PPR structures are yet known, the motif is predicted to fold into a helix-turn-helix structure similar to those found in the tetratricopeptide repeat (TPR) family (see PROSITEDOC) [PUBMED:10664580]. The plant PPR protein family has been divided in two subfamilies on the basis of their motif content and organisation [PUBMED:15269332, PUBMED:17560114]. Examples of PPR repeat-containing proteins include PET309 SWISSPROT, which may be involved in RNA stabilisation [PUBMED:7664742], and crp1, which is involved in RNA processing [PUBMED:8039510]. The repeat is associated with a predicted plant protein SWISSPROT that has a domain organisation similar to the human BRCA1 protein.
  7. Leucine Rich Repeat
    A leucine-rich repeat (LRR) is a protein structural motif that forms an α/β horseshoe fold.[1][2] It is composed of repeating 20–30 amino acid stretches that are unusually rich in the hydrophobic amino acid leucine. These repeats commonly fold together to form a solenoid protein domain, termed leucine-rich repeat domain. Typically, each repeat unit has beta strand-turn-alpha helix structure, and the assembled domain, composed of many such repeats, has a horseshoe shape with an interior parallel beta sheet and an exterior array of helices. One face of the beta sheet and one side of the helix array are exposed to solvent and are therefore dominated by hydrophilic residues. The region between the helices and sheets is the protein’s hydrophobic core and is tightly sterically packed with leucine residues. Leucine-rich repeats are frequently involved in the formation of protein–protein interactions.
    Leucine-rich repeats (LRR) consist of 2-45 motifs of 20-30 amino acids in length that generally folds into an arc or horseshoe shape [PUBMED:14747988]. LRRs occur in proteins ranging from viruses to eukaryotes, and appear to provide a structural framework for the formation of protein-protein interactions [PUBMED:11751054, PUBMED:1657640].Proteins containing LRRs include tyrosine kinase receptors, cell-adhesion molecules, virulence factors, and extracellular matrix-binding glycoproteins, and are involved in a variety of biological processes, including signal transduction, cell adhesion, DNA repair, recombination, transcription, RNA processing, disease resistance, apoptosis, and the immune response [PUBMED:2176636].
    Sequence analyses of LRR proteins suggested the existence of several different subfamilies of LRRs. The significance of this classification is that repeats from different subfamilies never occur simultaneously and have most probably evolved independently. It is, however, now clear that all major classes of LRR have curved horseshoe structures with a parallel beta sheet on the concave side and mostly helical elements on the convex side. At least six families of LRR proteins, characterised by different lengths and consensus sequences of the repeats, have been identified. Eleven-residue segments of the LRRs (LxxLxLxxN/CxL), corresponding to the beta-strand and adjacent loop regions, are conserved in LRR proteins, whereas the remaining parts of the repeats (herein termed variable) may be very different. Despite the differences, each of the variable parts contains two half-turns at both ends and a “linear” segment (as the chain follows a linear path overall), usually formed by a helix, in the middle. The concave face and the adjacent loops are the most common protein interaction surfaces on LRR proteins. 3D structure of some LRR proteins-ligand complexes show that the concave surface of LRR domain is ideal for interaction with alpha-helix, thus supporting earlier conclusions that the elongated and curved LRR structure provides an outstanding framework for achieving diverse protein-protein interactions [PUBMED:11751054]. Molecular modeling suggests that the conserved pattern LxxLxL, which is shorter than the previously proposed LxxLxLxxN/CxL is sufficient to impart the characteristic horseshoe curvature to proteins with 20- to 30-residue repeats [PUBMED:11967365].
    Leucine-rich repeat motifs have been identified in a large number of functionally unrelated proteins.[5] The best-known example is the ribonuclease inhibitor, but other proteins such as the tropomyosin regulator tropomodulin and the toll-like receptor also share the motif. In fact, the toll-like receptor possesses 10 successive LRR motifs which serve to bind pathogen- and danger-associated molecular patterns. Although the canonical LRR protein contains approximately one helix for every beta strand, variants that form beta-alpha superhelix folds sometimes have long loops rather than helices linking successive beta strands. One leucine-rich repeat variant domain (LRV) has a novel repetitive structural motif consisting of alternating alpha- and 310-helices arranged in a right-handed superhelix, with the absence of the beta-sheets present in other leucine-rich repeats.[6]
    CAUTION: This Pfam may not find all Leucine Rich Repeats in a protein. Leucine Rich Repeats are short sequence motifs present in a number of proteins with diverse functions and cellular locations. These repeats are usually involved in protein-protein interactions. Each Leucine Rich Repeat is composed of a beta-alpha unit. These units form elongated non-globular structures. Leucine Rich Repeats are often flanked by cysteine rich domains.
  8. Tetratricopeptide repeat
    This Pfam entry includes outlying Tetratricopeptide-like repeats (TPR) that are not matched by PF00515 (TPR).
    The tetratricopeptide repeat (TPR) is a structural motif present in a wide range of proteins [PUBMED:7667876, PUBMED:9482716, PUBMED:1882418]. It mediates protein-protein interactions and the assembly of multiprotein complexes [PUBMED:14659697]. The TPR motif consists of 3-16 tandem repeats of 34 amino acid residues, although individual TPR motifs can be dispersed in the protein sequence. Sequence alignment of the TPR domains reveals a consensus sequence defined by a pattern of small and large amino acids. TPR motifs have been identified in various different organisms, ranging from bacteria to humans. Proteins containing TPRs are involved in a variety of biological processes, such as cell cycle regulation, transcriptional control, mitochondrial and peroxisomal protein transport, neurogenesis and protein folding. The X-ray structure of a domain containing three TPRs from protein phosphatase 5 revealed that TPR adopts a helix-turn-helix arrangement, with adjacent TPR motifs packing in a parallel fashion, resulting in a spiral of repeating anti-parallel alpha-helices [PUBMED:14659697]. The two helices are denoted helix A and helix B. The packing angle between helix A and helix B is ~24 degrees within a single TPR and generates a right-handed superhelical shape. Helix A interacts with helix B and with helix A’ of the next TPR. Two protein surfaces are generated: the inner concave surface is contributed mainly by residues on helices A, and the other surface presents residues from both helices A and B.
  9. Pentatricopeptide repeat domain
    This entry represents the PPR repeat. Pentatricopeptide repeat (PPR) proteins are characterised by tandem repeats of a degenerate 35 amino acid motif [PUBMED:10664580]. Most of PPR proteins have roles in mitochondria or plastid [PUBMED:15270678]. PPR repeats were discovered while screening Arabidopsis proteins for those predicted to be targeted to mitochondria or chloroplast [PUBMED:10664580, PUBMED:15269332]. Some of these proteins have been shown to play a role in post-transcriptional processes within organelles and they are thought to be sequence-specific RNA-binding proteins [PUBMED:12782738, PUBMED:12832482, PUBMED:18031283]. Plant genomes have between one hundred to five hundred PPR genes per genome whereas non-plant genomes encode two to six PPR proteins. Although no PPR structures are yet known, the motif is predicted to fold into a helix-turn-helix structure similar to those found in the tetratricopeptide repeat (TPR) family (see PROSITEDOC) [PUBMED:10664580]. The plant PPR protein family has been divided in two subfamilies on the basis of their motif content and organisation [PUBMED:15269332, PUBMED:17560114]. Examples of PPR repeat-containing proteins include PET309 SWISSPROT, which may be involved in RNA stabilisation [PUBMED:7664742], and crp1, which is involved in RNA processing [PUBMED:8039510]. The repeat is associated with a predicted plant protein SWISSPROT that has a domain organisation similar to the human BRCA1 protein.
    This family matches additional variants of the PPR repeat that were not captured by the model for PF01535 (PPR). In the case of the Arabidopsis protein UniProtKB:Q66GI4, the repeated helices in this N-terminal region of protein-only RNase P (PRORP) enzymes form the pentatricopeptide repeat (PPR) domain, which enhances pre-tRNA binding affinity. PRORP enzymes process precursor tRNAs in human mitochondria and in all tRNA-using compartments of Arabidopsis thaliana [1].
  10. RNA recognition motif. (a.k.a. RRM, RBD, or RNP domain)
    RNA recognition motif, RNP-1 is a putative RNA-binding domain of about 90 amino acids that are known to bind single-stranded RNAs. It was found in many eukaryotic proteins.[1][2][3] The largest group of single strand RNA-binding protein is the eukaryotic RNA recognition motif (RRM) family that contains an eight amino acid RNP-1 consensus sequence.[4][5] RRM proteins have a variety of RNA binding preferences and functions, and include heterogeneous nuclear ribonucleoproteins (hnRNPs), proteins implicated in regulation of alternative splicing (SR, U2AF2, Sxl), protein components of small nuclear ribonucleoproteins (U1 and U2 snRNPs), and proteins that regulate RNA stability and translation (PABP, La, Hu).[2][3][5] The RRM in heterodimeric splicing factor U2 snRNP auxiliary factor appears to have two RRM-like domains with specialised features for protein recognition.[6] The motif also appears in a few single stranded DNA binding proteins. The typical RRM consists of four anti-parallel beta-strands and two alpha-helices arranged in a beta-alpha-beta-beta-alpha-beta fold with side chains that stack with RNA bases. A third helix is present during RNA binding in some cases.[7] The RRM is reviewed in a number of publications.[8][9][10]
    The RRM motif is probably diagnostic of an RNA binding protein. RRMs are found in a variety of RNA binding proteins, including various hnRNP proteins, proteins implicated in regulation of alternative splicing, and protein components of snRNPs. The motif also appears in a few single stranded DNA binding proteins. The RRM structure consists of four strands and two helices arranged in an alpha/beta sandwich, with a third helix present during RNA binding in some cases. The C-terminal beta strand (4th strand) and final helix are hard to align and have been omitted in the SEED alignment. The LA proteins (P05455) have an N terminal rrm which is included in the seed. There is a second region towards the C terminus that has some features characteristic of a rrm but does not appear to have the important structural core of a rrm. The LA proteins (P05455) are one of the main autoantigens in Systemic lupus erythematosus (SLE), an autoimmune disease.

SUMMARY:

TODO:
- look at PPR: http://www.plantcell.org/content/16/8/2089

Top-10 Metazoa PFAM-Families

# Select the Metazoa rows of d_kingdom that are not flagged mitochondrial,
# then keep the TOP rows with the highest count.
top_metazoa <- subset(
  d_kingdom,
  Superkingdom == "Eukaryota" & Kingdom == "Metazoa" &
    is_mitochondrial == FALSE
)
# head() stops at the last available row, so a subset with fewer than TOP rows
# is returned as-is instead of being padded with all-NA rows (which indexing
# with order(...)[1:TOP] produced). The outer parentheses print the result.
(top_metazoa <- head(
  top_metazoa[order(top_metazoa$count, decreasing = TRUE), ],
  TOP
))
##      Superkingdom Kingdom PFAM_Name
## 185     Eukaryota Metazoa      WD40
## 77      Eukaryota Metazoa   zf-C2H2
## 1153    Eukaryota Metazoa     LRR_8
## 61      Eukaryota Metazoa     RRM_1
## 1121    Eukaryota Metazoa EF-hand_7
## 196     Eukaryota Metazoa       LIM
## 449     Eukaryota Metazoa  Collagen
## 854     Eukaryota Metazoa    EGF_CA
## 4       Eukaryota Metazoa       EGF
## 237     Eukaryota Metazoa     TPR_1
##                                                    PFAM_desc   model
## 185                                 WD domain, G-beta repeat PF00400
## 77                                    Zinc finger, C2H2 type PF00096
## 1153                                     Leucine rich repeat PF13855
## 61   RNA recognition motif. (a.k.a. RRM, RBD, or RNP domain) PF00076
## 1121                                     EF-hand domain pair PF13499
## 196                                               LIM domain PF00412
## 449                 Collagen triple helix repeat (20 copies) PF01391
## 854                               Calcium-binding EGF domain PF07645
## 4                                            EGF-like domain PF00008
## 237                                 Tetratricopeptide repeat PF00515
##      is_chloroplastic is_mitochondrial count
## 185             FALSE            FALSE   826
## 77              FALSE            FALSE   805
## 1153            FALSE            FALSE   449
## 61              FALSE            FALSE   291
## 1121            FALSE            FALSE   263
## 196             FALSE            FALSE   242
## 449             FALSE            FALSE   181
## 854             FALSE            FALSE   174
## 4               FALSE            FALSE   135
## 237             FALSE            FALSE   126
  1. WD domain, G-beta repeat
    WD-40 repeats (also known as WD or beta-transducin repeats) are short ~40 amino acid motifs, often terminating in a Trp-Asp (W-D) dipeptide. WD40 repeats usually assume a 7-8 bladed beta-propeller fold, but proteins have been found with 4 to 16 repeated units, which also form a circularised beta-propeller structure. WD-repeat proteins are a large family found in all eukaryotes and are implicated in a variety of functions ranging from signal transduction and transcription regulation to cell cycle control and apoptosis. Repeated WD40 motifs act as a site for protein-protein interaction, and proteins containing WD40 repeats are known to serve as platforms for the assembly of protein complexes or mediators of transient interplay among other proteins. The specificity of the proteins is determined by the sequences outside the repeats themselves. Examples of such complexes are G proteins (beta subunit is a beta-propeller), TAFII transcription factor, and E3 ubiquitin ligase [PUBMED:11814058, PUBMED:10322433]. In Arabidopsis spp., several WD40-containing proteins act as key regulators of plant-specific developmental events. One group of WD40 repeats is involved in RNA-processing complexes, some members forming part of snRNP particles [6]. Another group makes up parts of transcriptional regulators, including the TFIID subunit of the TATA-box-binding complex [7, 8, 9]. Others play roles in cytoskeleton assembly and mitotic-spindle formation [10, 11], regulate vesicle formation and vesicular trafficking [12], control various aspects of cell division [13] or regulate sulfur metabolism in fungi [14]. Examples of such complexes are G proteins (beta subunit is a beta-propeller), TAFII transcription factor, and E3 ubiquitin ligase.[3][4]
  2. Zinc finger, C2H2 type
    The C2H2 zinc finger is the classical zinc finger domain. The two conserved cysteines and histidines co-ordinate a zinc ion. The following pattern describes the zinc finger: #-X-C-X(1-5)-C-X3-#-X5-#-X2-H-X(3-6)-[H/C], where X can be any amino acid, and numbers in brackets indicate the number of residues. The positions marked # are those that are important for the stable fold of the zinc finger. The final position can be either His or Cys. The C2H2 zinc finger is composed of two short beta strands followed by an alpha helix. The amino terminal part of the helix binds the major groove in DNA binding zinc fingers. The accepted consensus binding sequence for Sp1 is usually defined by the asymmetric hexanucleotide core GGGCGG but this sequence does not include, among others, the GAG (=CTC) repeat that constitutes a high-affinity site for Sp1 binding to the wt1 promoter [2].
    They were first identified as a DNA-binding motif in transcription factor TFIIIA from Xenopus laevis (African clawed frog), however they are now recognised to bind DNA, RNA, protein, and/or lipid substrates.[11][12][13][14][15] Their binding properties depend on the amino acid sequence of the finger domains and on the linker between fingers, as well as on the higher-order structures and the number of fingers. Znf domains are often found in clusters, where fingers can have different binding specificities. Znf motifs occur in several unrelated protein superfamilies, varying in both sequence and structure. They display considerable versatility in binding modes, even between members of the same class (e.g., some bind DNA, others protein), suggesting that Znf motifs are stable scaffolds that have evolved specialised functions. For example, Znf-containing proteins function in gene transcription, translation, mRNA trafficking, cytoskeleton organization, epithelial development, cell adhesion, protein folding, chromatin remodeling, and zinc sensing, to name but a few.[16] Zinc-binding motifs are stable structures, and they rarely undergo conformational changes upon binding their target.
  3. Leucine rich repeat (LRR_8)
    A leucine-rich repeat (LRR) is a protein structural motif that forms an α/β horseshoe fold.[1][2] It is composed of repeating 20–30 amino acid stretches that are unusually rich in the hydrophobic amino acid leucine. These repeats commonly fold together to form a solenoid protein domain, termed leucine-rich repeat domain. Typically, each repeat unit has beta strand-turn-alpha helix structure, and the assembled domain, composed of many such repeats, has a horseshoe shape with an interior parallel beta sheet and an exterior array of helices. One face of the beta sheet and one side of the helix array are exposed to solvent and are therefore dominated by hydrophilic residues. The region between the helices and sheets is the protein’s hydrophobic core and is tightly sterically packed with leucine residues. Leucine-rich repeats are frequently involved in the formation of protein–protein interactions.
    Leucine-rich repeats (LRR) consist of 2-45 motifs of 20-30 amino acids in length that generally folds into an arc or horseshoe shape [PUBMED:14747988]. LRRs occur in proteins ranging from viruses to eukaryotes, and appear to provide a structural framework for the formation of protein-protein interactions [PUBMED:11751054, PUBMED:1657640].Proteins containing LRRs include tyrosine kinase receptors, cell-adhesion molecules, virulence factors, and extracellular matrix-binding glycoproteins, and are involved in a variety of biological processes, including signal transduction, cell adhesion, DNA repair, recombination, transcription, RNA processing, disease resistance, apoptosis, and the immune response [PUBMED:2176636].
    Sequence analyses of LRR proteins suggested the existence of several different subfamilies of LRRs. The significance of this classification is that repeats from different subfamilies never occur simultaneously and have most probably evolved independently. It is, however, now clear that all major classes of LRR have curved horseshoe structures with a parallel beta sheet on the concave side and mostly helical elements on the convex side. At least six families of LRR proteins, characterised by different lengths and consensus sequences of the repeats, have been identified. Eleven-residue segments of the LRRs (LxxLxLxxN/CxL), corresponding to the beta-strand and adjacent loop regions, are conserved in LRR proteins, whereas the remaining parts of the repeats (herein termed variable) may be very different. Despite the differences, each of the variable parts contains two half-turns at both ends and a “linear” segment (as the chain follows a linear path overall), usually formed by a helix, in the middle. The concave face and the adjacent loops are the most common protein interaction surfaces on LRR proteins. 3D structure of some LRR proteins-ligand complexes show that the concave surface of LRR domain is ideal for interaction with alpha-helix, thus supporting earlier conclusions that the elongated and curved LRR structure provides an outstanding framework for achieving diverse protein-protein interactions [PUBMED:11751054]. Molecular modeling suggests that the conserved pattern LxxLxL, which is shorter than the previously proposed LxxLxLxxN/CxL is sufficient to impart the characteristic horseshoe curvature to proteins with 20- to 30-residue repeats [PUBMED:11967365].
    Leucine-rich repeat motifs have been identified in a large number of functionally unrelated proteins.[5] The best-known example is the ribonuclease inhibitor, but other proteins such as the tropomyosin regulator tropomodulin and the toll-like receptor also share the motif. In fact, the toll-like receptor possesses 10 successive LRR motifs which serve to bind pathogen- and danger-associated molecular patterns. Although the canonical LRR protein contains approximately one helix for every beta strand, variants that form beta-alpha superhelix folds sometimes have long loops rather than helices linking successive beta strands. One leucine-rich repeat variant domain (LRV) has a novel repetitive structural motif consisting of alternating alpha- and 310-helices arranged in a right-handed superhelix, with the absence of the beta-sheets present in other leucine-rich repeats.[6]
  4. RNA recognition motif. (a.k.a. RRM, RBD, or RNP domain)
    RNA recognition motif, RNP-1 is a putative RNA-binding domain of about 90 amino acids that are known to bind single-stranded RNAs. It was found in many eukaryotic proteins.[1][2][3] The largest group of single strand RNA-binding protein is the eukaryotic RNA recognition motif (RRM) family that contains an eight amino acid RNP-1 consensus sequence.[4][5] RRM proteins have a variety of RNA binding preferences and functions, and include heterogeneous nuclear ribonucleoproteins (hnRNPs), proteins implicated in regulation of alternative splicing (SR, U2AF2, Sxl), protein components of small nuclear ribonucleoproteins (U1 and U2 snRNPs), and proteins that regulate RNA stability and translation (PABP, La, Hu).[2][3][5] The RRM in heterodimeric splicing factor U2 snRNP auxiliary factor appears to have two RRM-like domains with specialised features for protein recognition.[6] The motif also appears in a few single stranded DNA binding proteins. The typical RRM consists of four anti-parallel beta-strands and two alpha-helices arranged in a beta-alpha-beta-beta-alpha-beta fold with side chains that stack with RNA bases. A third helix is present during RNA binding in some cases.[7] The RRM is reviewed in a number of publications.[8][9][10]
    The RRM motif is probably diagnostic of an RNA binding protein. RRMs are found in a variety of RNA binding proteins, including various hnRNP proteins, proteins implicated in regulation of alternative splicing, and protein components of snRNPs. The motif also appears in a few single stranded DNA binding proteins. The RRM structure consists of four strands and two helices arranged in an alpha/beta sandwich, with a third helix present during RNA binding in some cases. The C-terminal beta strand (4th strand) and final helix are hard to align and have been omitted in the SEED alignment. The LA proteins (P05455) have an N terminal rrm which is included in the seed. There is a second region towards the C terminus that has some features characteristic of a rrm but does not appear to have the important structural core of a rrm. The LA proteins (P05455) are one of the main autoantigens in Systemic lupus erythematosus (SLE), an autoimmune disease.
  5. EF-hand domain pair
    The EF hand is a helix-loop-helix structural domain or motif found in a large family of calcium-binding proteins. The EF-hand motif contains a helix-loop-helix topology, much like the spread thumb and forefinger of the human hand, in which the Ca2+ ions are coordinated by ligands within the loop. The motif takes its name from traditional nomenclature used in describing the protein parvalbumin, which contains three such motifs and is probably involved in muscle relaxation via its calcium-binding activity. The EF-hand consists of two alpha helices linked by a short loop region (usually about 12 amino acids) that usually binds calcium ions. EF-hands also appear in each structural domain of the signaling protein calmodulin and in the muscle protein troponin-C. Example: Aequorin is a calcium binding protein (CaBP) isolated from the coelenterate Aequorea victoria. Aequorin belongs to the EF-hand family of CaBPs, with EF-hand loops that are closely related to CaBPs in mammals. In addition, aequorin has been used for years as an indicator of Ca2+ and has been shown to be safe and well tolerated by cells. Aequorin is made up of two components – the calcium binding component apoaequorin (AQ) and the chemiluminescent molecule coelenterazine. The AQ portion of this protein contains the EF-hand calcium binding domains.[2]
  6. LIM domain
    LIM domains are protein structural domains, composed of two contiguous zinc finger domains, separated by a two-amino acid residue hydrophobic linker.[1] They are named after their initial discovery in the proteins Lin11, Isl-1 & Mec-3.[2] LIM-domain containing proteins have been shown to play roles in cytoskeletal organisation, organ development and oncogenesis. LIM-domains mediate protein–protein interactions that are critical to cellular processes. LIM domains have highly divergent sequences, apart from certain key residues. The sequence divergence allows a great many different binding sites to be grafted onto the same basic domain. The conserved residues are those involved in zinc binding or the hydrophobic core of the protein. The sequence signature of LIM domains is as follows: [C]-[X]2–4-[C]-[X]13–19-[W]-[H]-[X]2–4-[C]-[F]-[LVI]-[C]-[X]2–4-[C]-[X]13–20-C-[X]2–4-[C] LIM domains frequently occur in multiples, as seen in proteins such as TES, LMO4, and can also be attached to other domains in order to confer a binding or targeting function upon them, such as LIM-kinase. The LIM superclass of genes have been classified into 14 classes: ABLIM, CRP, ENIGMA, EPLIN, LASP, LHX, LMO, LIMK, LMO7, MICAL, PXN, PINCH, TES, and ZYX. Six of these classes (i.e., ABLIM, MICAL, ENIGMA, ZYX, LHX, LMO7) originated in the stem lineage of animals, and this expansion is thought to have made a major contribution to the origin of animal multicellularity.[3] LIM domains are also found in various bacterial lineages where they are typically fused to a metallopeptidase domain. Some versions show fusions to an inactive P-loop NTPase at their N-terminus and a single transmembrane helix. These domain fusions suggest that the prokaryotic LIM domains are likely to regulate protein processing at the cell membrane. The domain architectural syntax is remarkably parallel to those of the prokaryotic versions of the B-box zinc finger and the AN1 zinc finger domains.
  7. Collagen triple helix repeat (20 copies)
    In collagen, the collagen helix, or type-2 helix, is a major shape in secondary structure. It consists of a triple helix made of the repetitious amino acid sequence glycine - X - Y, where X and Y are frequently proline or hydroxyproline.[2][3] A collagen triple helix has 3.3 residues per turn.[4] Each of the three chains is stabilized by the steric repulsion due to the pyrrolidine rings of proline and hydroxyproline residues. The pyrrolidine rings keep out of each other’s way when the polypeptide chain assumes this extended helical form, which is much more open than the tightly coiled form of the alpha helix. The three chains are hydrogen bonded to each other. The hydrogen bond donors are the peptide NH groups of glycine residues. The hydrogen bond acceptors are the CO groups of residues on the other chains. The OH group of hydroxyproline also participates in hydrogen bonding. The rise of the collagen helix (superhelix) is 2.9 Å (0.29 nm) per residue.
    Members of this family belong to the collagen superfamily [1]. Collagens are generally extracellular structural proteins involved in formation of connective tissue structure. The alignment contains 20 copies of the G-X-Y repeat that forms a triple helix. The first position of the repeat is glycine, the second and third positions can be any residue but are frequently proline and hydroxy-proline. Collagens are post translationally modified by proline hydroxylase to form the hydroxy-proline residues. Defective hydroxylation is the cause of scurvy. Some members of the collagen superfamily are not involved in connective tissue structure but share the same triple helical structure. The family includes bacterial collagen-like triple-helix repeat proteins [2,3].
  8. Calcium-binding EGF domain
    In molecular biology, the calcium-binding EGF domain is a conserved domain of about forty amino-acid residues found in epidermal growth factor (EGF). This domain is present in a large number of membrane-bound and extracellular, mostly animal, proteins.[1][2][3][4][5] Many of these proteins require calcium for their biological function and a calcium-binding site has been found at the N-terminus of some EGF-like domains.[6] Calcium-binding may be crucial for numerous protein-protein interactions. For human coagulation factor IX it has been shown that the calcium-ligands form a pentagonal bipyramid.[7] The first, third and fourth conserved negatively charged or polar residues are side chain ligands. The latter is possibly hydroxylated.[6] A conserved aromatic residue, as well as the second conserved negative residue, are thought to be involved in stabilising the calcium-binding site. As in non-calcium binding EGF-like domains, there are six conserved cysteines and the structure of both types is very similar as calcium-binding induces only strictly local structural changes.[6]
    A sequence of about forty amino-acid residues found in epidermal growth factor (EGF) has been shown [PUBMED:2288911, PUBMED:6334307, PUBMED:3534958, PUBMED:6607417, PUBMED:3282918] to be present in a large number of membrane-bound and extracellular, mostly animal, proteins. Many of these proteins require calcium for their biological function and a calcium-binding site has been found at the N terminus of some EGF-like domains [PUBMED:1527084]. Calcium-binding may be crucial for numerous protein-protein interactions. For human coagulation factor IX it has been shown [PUBMED:7606779] that the calcium-ligands form a pentagonal bipyramid. The first, third and fourth conserved negatively charged or polar residues are side chain ligands. The latter is possibly hydroxylated (see aspartic acid and asparagine hydroxylation site) [PUBMED:1527084]. A conserved aromatic residue, as well as the second conserved negative residue, are thought to be involved in stabilising the calcium-binding site. As in non-calcium binding EGF-like domains, there are six conserved cysteines and the structure of both types is very similar as calcium-binding induces only strictly local structural changes [PUBMED:1527084].
  9. EGF-like domain
    The EGF-like domain is an evolutionary conserved protein domain, which derives its name from the epidermal growth factor where it was first described. It comprises about 30 to 40 amino-acid residues and has been found in a large number of mostly animal proteins.[2][3] Most occurrences of the EGF-like domain are found in the extracellular domain of membrane-bound proteins or in proteins known to be secreted. An exception to this is the prostaglandin-endoperoxide synthase. The EGF-like domain includes 6 cysteine residues which in the epidermal growth factor have been shown to form 3 disulfide bonds. The structures of 4-disulfide EGF-domains have been solved from the laminin and integrin proteins. The main structure of EGF-like domains is a two-stranded β-sheet followed by a loop to a short C-terminal, two-stranded β-sheet. These two β-sheets are usually denoted as the major (N-terminal) and minor (C-terminal) sheets.[4] EGF-like domains frequently occur in numerous tandem copies in proteins: these repeats typically fold together to form a single, linear solenoid domain block as a functional unit. The domain has roles in the immune system, apoptosis, and Ca2+ binding.
  10. Tetratricopeptide repeat
    The tetratrico peptide repeat (TPR) is a structural motif present in a wide range of proteins [PUBMED:7667876, PUBMED:9482716, PUBMED:1882418]. It mediates protein-protein interactions and the assembly of multiprotein complexes [PUBMED:14659697]. The TPR motif consists of 3-16 tandem-repeats of 34 amino acids residues, although individual TPR motifs can be dispersed in the protein sequence. Sequence alignment of the TPR domains reveals a consensus sequence defined by a pattern of small and large amino acids. TPR motifs have been identified in various different organisms, ranging from bacteria to humans. Proteins containing TPRs are involved in a variety of biological processes, such as cell cycle regulation, transcriptional control, mitochondrial and peroxisomal protein transport, neurogenesis and protein folding. The X-ray structure of a domain containing three TPRs from protein phosphatase 5 revealed that TPR adopts a helix-turn-helix arrangement, with adjacent TPR motifs packing in a parallel fashion, resulting in a spiral of repeating anti-parallel alpha-helices [PUBMED:14659697]. The two helices are denoted helix A and helix B. The packing angle between helix A and helix B is ~24 degrees within a single TPR and generates a right-handed superhelical shape. Helix A interacts with helix B and with helix A’ of the next TPR. Two protein surfaces are generated: the inner concave surface is contributed to mainly by residues on helices A, and the other surface presents residues from both helices A and B.

SUMMARY:

Top-10 Fungi PFAM-Families

# Keep only non-mitochondrial fungal (Eukaryota/Fungi) rows of d_kingdom.
top_fungi <- subset(d_kingdom, Superkingdom == "Eukaryota" & Kingdom == "Fungi" & is_mitochondrial == FALSE)
# Sort by descending count and keep at most TOP rows, then print the result.
# head() is used instead of [1:TOP] so that fewer than TOP matching rows
# do not produce padded all-NA rows in the table.
(top_fungi <- top_fungi[head(order(top_fungi$count, decreasing = TRUE), TOP), ])
##      Superkingdom Kingdom PFAM_Name
## 184     Eukaryota   Fungi      WD40
## 62      Eukaryota   Fungi     RRM_1
## 1124    Eukaryota   Fungi EF-hand_7
## 78      Eukaryota   Fungi   zf-C2H2
## 388     Eukaryota   Fungi       PUF
## 910     Eukaryota   Fungi       NLE
## 235     Eukaryota   Fungi       Arm
## 241     Eukaryota   Fungi     TPR_1
## 9       Eukaryota   Fungi      KH_1
## 97      Eukaryota   Fungi   Hexapep
##                                                    PFAM_desc   model
## 184                                 WD domain, G-beta repeat PF00400
## 62   RNA recognition motif. (a.k.a. RRM, RBD, or RNP domain) PF00076
## 1124                                     EF-hand domain pair PF13499
## 78                                    Zinc finger, C2H2 type PF00096
## 388                        Pumilio-family RNA binding repeat PF00806
## 910                                      NLE (NUC135) domain PF08154
## 235                       Armadillo/beta-catenin-like repeat PF00514
## 241                                 Tetratricopeptide repeat PF00515
## 9                                                  KH domain PF00013
## 97           Bacterial transferase hexapeptide (six repeats) PF00132
##      is_chloroplastic is_mitochondrial count
## 184             FALSE            FALSE   428
## 62              FALSE            FALSE    68
## 1124            FALSE            FALSE    32
## 78              FALSE            FALSE    23
## 388             FALSE            FALSE    23
## 910             FALSE            FALSE    20
## 235             FALSE            FALSE    19
## 241             FALSE            FALSE    19
## 9               FALSE            FALSE    18
## 97              FALSE            FALSE    17
  1. WD domain, G-beta repeat
    WD-40 repeats (also known as WD or beta-transducin repeats) are short ~40 amino acid motifs, often terminating in a Trp-Asp (W-D) dipeptide. WD40 repeats usually assume a 7-8 bladed beta-propeller fold, but proteins have been found with 4 to 16 repeated units, which also form a circularised beta-propeller structure. WD-repeat proteins are a large family found in all eukaryotes and are implicated in a variety of functions ranging from signal transduction and transcription regulation to cell cycle control and apoptosis. Repeated WD40 motifs act as a site for protein-protein interaction, and proteins containing WD40 repeats are known to serve as platforms for the assembly of protein complexes or mediators of transient interplay among other proteins. The specificity of the proteins is determined by the sequences outside the repeats themselves. Examples of such complexes are G proteins (beta subunit is a beta-propeller), TAFII transcription factor, and E3 ubiquitin ligase [PUBMED:11814058, PUBMED:10322433]. In Arabidopsis spp., several WD40-containing proteins act as key regulators of plant-specific developmental events. One group of WD40 repeats is involved in RNA-processing complexes, some members forming part of snRNP particles [6]. Another group makes up parts of transcriptional regulators, including the TFIID subunit of the TATA-box-binding complex [7, 8, 9]. Others play roles in cytoskeleton assembly and mitotic-spindle formation [10, 11], regulate vesicle formation and vesicular trafficking [12], control various aspects of cell division [13] or regulate sulfur metabolism in fungi [14]. Examples of such complexes are G proteins (beta subunit is a beta-propeller), TAFII transcription factor, and E3 ubiquitin ligase.[3][4]
  2. RNA recognition motif. (a.k.a. RRM, RBD, or RNP domain)
    RNA recognition motif, RNP-1 is a putative RNA-binding domain of about 90 amino acids that are known to bind single-stranded RNAs. It was found in many eukaryotic proteins.[1][2][3] The largest group of single strand RNA-binding protein is the eukaryotic RNA recognition motif (RRM) family that contains an eight amino acid RNP-1 consensus sequence.[4][5] RRM proteins have a variety of RNA binding preferences and functions, and include heterogeneous nuclear ribonucleoproteins (hnRNPs), proteins implicated in regulation of alternative splicing (SR, U2AF2, Sxl), protein components of small nuclear ribonucleoproteins (U1 and U2 snRNPs), and proteins that regulate RNA stability and translation (PABP, La, Hu).[2][3][5] The RRM in heterodimeric splicing factor U2 snRNP auxiliary factor appears to have two RRM-like domains with specialised features for protein recognition.[6] The motif also appears in a few single stranded DNA binding proteins. The typical RRM consists of four anti-parallel beta-strands and two alpha-helices arranged in a beta-alpha-beta-beta-alpha-beta fold with side chains that stack with RNA bases. A third helix is present during RNA binding in some cases.[7] The RRM is reviewed in a number of publications.[8][9][10]
    The RRM motif is probably diagnostic of an RNA binding protein. RRMs are found in a variety of RNA binding proteins, including various hnRNP proteins, proteins implicated in regulation of alternative splicing, and protein components of snRNPs. The motif also appears in a few single stranded DNA binding proteins. The RRM structure consists of four strands and two helices arranged in an alpha/beta sandwich, with a third helix present during RNA binding in some cases. The C-terminal beta strand (4th strand) and final helix are hard to align and have been omitted in the SEED alignment. The LA proteins (P05455) have an N terminal rrm which is included in the seed. There is a second region towards the C terminus that has some features characteristic of a rrm but does not appear to have the important structural core of a rrm. The LA proteins (P05455) are one of the main autoantigens in Systemic lupus erythematosus (SLE), an autoimmune disease.
  3. EF-hand domain pair
    The EF hand is a helix-loop-helix structural domain or motif found in a large family of calcium-binding proteins. The EF-hand motif contains a helix-loop-helix topology, much like the spread thumb and forefinger of the human hand, in which the Ca2+ ions are coordinated by ligands within the loop. The motif takes its name from traditional nomenclature used in describing the protein parvalbumin, which contains three such motifs and is probably involved in muscle relaxation via its calcium-binding activity. The EF-hand consists of two alpha helices linked by a short loop region (usually about 12 amino acids) that usually binds calcium ions. EF-hands also appear in each structural domain of the signaling protein calmodulin and in the muscle protein troponin-C. Example: Aequorin is a calcium binding protein (CaBP) isolated from the coelenterate Aequorea victoria. Aequorin belongs to the EF-hand family of CaBPs, with EF-hand loops that are closely related to CaBPs in mammals. In addition, aequorin has been used for years as an indicator of Ca2+ and has been shown to be safe and well tolerated by cells. Aequorin is made up of two components – the calcium binding component apoaequorin (AQ) and the chemiluminescent molecule coelenterazine. The AQ portion of this protein contains the EF-hand calcium binding domains.[2]
  4. Zinc finger, C2H2 type
    The C2H2 zinc finger is the classical zinc finger domain. The two conserved cysteines and histidines co-ordinate a zinc ion. The following pattern describes the zinc finger: #-X-C-X(1-5)-C-X3-#-X5-#-X2-H-X(3-6)-[H/C] Where X can be any amino acid, and numbers in brackets indicate the number of residues. The positions marked # are those that are important for the stable fold of the zinc finger. The final position can be either his or cys. The C2H2 zinc finger is composed of two short beta strands followed by an alpha helix. The amino terminal part of the helix binds the major groove in DNA binding zinc fingers. The accepted consensus binding sequence for Sp1 is usually defined by the asymmetric hexanucleotide core GGGCGG but this sequence does not include, among others, the GAG (=CTC) repeat that constitutes a high-affinity site for Sp1 binding to the wt1 promoter [2].
    They were first identified as a DNA-binding motif in transcription factor TFIIIA from Xenopus laevis (African clawed frog), however they are now recognised to bind DNA, RNA, protein, and/or lipid substrates.[11][12][13][14][15] Their binding properties depend on the amino acid sequence of the finger domains and on the linker between fingers, as well as on the higher-order structures and the number of fingers. Znf domains are often found in clusters, where fingers can have different binding specificities. Znf motifs occur in several unrelated protein superfamilies, varying in both sequence and structure. They display considerable versatility in binding modes, even between members of the same class (e.g., some bind DNA, others protein), suggesting that Znf motifs are stable scaffolds that have evolved specialised functions. For example, Znf-containing proteins function in gene transcription, translation, mRNA trafficking, cytoskeleton organization, epithelial development, cell adhesion, protein folding, chromatin remodeling, and zinc sensing, to name but a few.[16] Zinc-binding motifs are stable structures, and they rarely undergo conformational changes upon binding their target.
  5. Pumilio-family RNA binding repeat
    Puf repeats (aka PUM-HD, Pumilio homology domain) are necessary and sufficient for sequence specific RNA binding in fly Pumilio and worm FBF-1 and FBF-2. Both proteins function as translational repressors in early embryonic development by binding sequences in the 3’ UTR of target mRNAs (e.g. the nanos response element (NRE) in fly Hunchback mRNA, or the point mutation element (PME) in worm fem-3 mRNA). Other proteins that contain Puf domains are also plausible RNA binding proteins. P47135 for instance, appears to also contain a single RRM domain by HMM analysis. Puf domains usually occur as a tandem repeat of 8 domains. The Pfam model does not necessarily recognise all 8 repeats in all sequences; some sequences appear to have 5 or 6 repeats on initial analysis, but further analysis suggests the presence of additional divergent repeats. Structures of PUF repeat proteins show they consist of a two helix structure [3,4].
  6. NLE (NUC135) domain
    This domain is located N terminal to WD40 repeats. It is found in the microtubule-associated protein Q12024 [1].
  7. Armadillo/beta-catenin-like repeat
    The armadillo (Arm) repeat is an approximately 40 amino acid long tandemly repeated sequence motif first identified in the Drosophila melanogaster segment polarity gene armadillo involved in signal transduction through wingless. Animal Arm-repeat proteins function in various processes, including intracellular signalling and cytoskeletal regulation, and include such proteins as beta-catenin, the junctional plaque protein plakoglobin, the adenomatous polyposis coli (APC) tumour suppressor protein, and the nuclear transport factor importin-alpha, amongst others [PUBMED:9770300]. A subset of these proteins is conserved across eukaryotic kingdoms. In higher plants, some Arm-repeat proteins function in intracellular signalling like their mammalian counterparts, while others have novel functions [PUBMED:12946625]. The 3-dimensional fold of an armadillo repeat is known from the crystal structure of beta-catenin, where the 12 repeats form a superhelix of alpha helices with three helices per unit [PUBMED:9298899]. The cylindrical structure features a positively charged groove, which presumably interacts with the acidic surfaces of the known interaction partners of beta-catenin.
  8. Tetratricopeptide repeat
    The tetratrico peptide repeat (TPR) is a structural motif present in a wide range of proteins [PUBMED:7667876, PUBMED:9482716, PUBMED:1882418]. It mediates protein-protein interactions and the assembly of multiprotein complexes [PUBMED:14659697]. The TPR motif consists of 3-16 tandem-repeats of 34 amino acids residues, although individual TPR motifs can be dispersed in the protein sequence. Sequence alignment of the TPR domains reveals a consensus sequence defined by a pattern of small and large amino acids. TPR motifs have been identified in various different organisms, ranging from bacteria to humans. Proteins containing TPRs are involved in a variety of biological processes, such as cell cycle regulation, transcriptional control, mitochondrial and peroxisomal protein transport, neurogenesis and protein folding. The X-ray structure of a domain containing three TPRs from protein phosphatase 5 revealed that TPR adopts a helix-turn-helix arrangement, with adjacent TPR motifs packing in a parallel fashion, resulting in a spiral of repeating anti-parallel alpha-helices [PUBMED:14659697]. The two helices are denoted helix A and helix B. The packing angle between helix A and helix B is ~24 degrees within a single TPR and generates a right-handed superhelical shape. Helix A interacts with helix B and with helix A’ of the next TPR. Two protein surfaces are generated: the inner concave surface is contributed to mainly by residues on helices A, and the other surface presents residues from both helices A and B.
  9. KH domain
    The K Homology (KH) domain is a protein domain that was first identified in the human heterogeneous nuclear ribonucleoprotein (hnRNP) K. An evolutionarily conserved sequence of around 70 amino acids, the KH domain is present in a wide variety of nucleic acid-binding proteins. The KH domain binds RNA, and can function in RNA recognition.[1] It is found in multiple copies in several proteins, where they can function cooperatively or independently. For example, in the AU-rich element RNA-binding protein KSRP, which has 4 KH domains, KH domains 3 and 4 behave as independent binding modules to interact with different regions of the AU-rich RNA targets.[1] The solution structure of the first KH domain of FMR1 and of the C-terminal KH domain of hnRNP K determined by nuclear magnetic resonance (NMR) revealed a beta-alpha-alpha-beta-beta-alpha structure.[2][3] Autoantibodies to NOVA1, a KH domain protein, cause paraneoplastic opsoclonus ataxia. The KH domain is found at the N-terminus of the ribosomal protein S3. This domain is unusual in that it has a different fold compared to the normal KH domain.[4] KH domains bind to either RNA or single stranded DNA. The nucleic acid is bound in an extended conformation across one side of the domain. The binding occurs in a cleft formed between alpha helix 1, alpha helix 2, the GXXG loop (contains a highly conserved sequence motif) and the variable loop.[5] The binding cleft is hydrophobic in nature with a variety of additional protein specific interactions to stabilise the complex. Valverde and colleagues note that, “Nucleic acid base-to-protein aromatic side chain stacking interactions which are prevalent in other types of single stranded nucleic acid binding motifs, are notably absent in KH domain nucleic acid recognition”.[5]
  10. Bacterial transferase hexapeptide (six repeats)
    A variety of bacterial transferases contain a repeat structure composed of tandem repeats of a [LIV]-G-X(4) hexapeptide, which, in the tertiary structure of LpxA (UDP N-acetylglucosamine acyltransferase) [PUBMED:7481807], has been shown to form a left-handed parallel beta helix. A number of different transferase protein families contain this repeat, such as
    galactoside acetyltransferase-like proteins [PUBMED:11937062]: The galactoside acetyltransferase (thiogalactoside transacetylase) of Escherichia coli (GAT, LacA, EC 2.3.1.18) is a gene product of the classical lac operon. GAT may assist cellular detoxification by acetylating nonmetabolizable pyranosides, thereby preventing their reentry into the cell.
    the gamma-class of carbonic anhydrases [PUBMED:10924115]: Carbonic anhydrases (CA: EC:4.2.1.1) are zinc metalloenzymes which catalyse the reversible hydration of carbon dioxide to bicarbonate [PMID: 18336305, PMID: 10978542].
    and tetrahydrodipicolinate-N-succinyltransferases (DapD), the latter containing an extra N-terminal 3-helical domain [PUBMED:11910040]: Tetrahydrodipicolinate N-succinyltransferase (DapD) catalyzes the succinyl-CoA-dependent acylation of L-2-amino-6-oxopimelate to 2-N-succinyl-6-oxopimelate as part of the succinylase branch of the meso-diaminopimelate/lysine biosynthetic pathway of bacteria, blue-green algae, and plants. This pathway provides meso-diaminopimelate as a building block for cell wall peptidoglycan in most bacteria, and is regarded as a target pathway for antibacterial agents.

SUMMARY:
- RNA binding (Pumilio)
- RNA recognition (KH-domain)

Top-10 chloroplastic PFAM-Families

# Keep only the rows of d_superkingdom flagged as chloroplastic.
top_chloroplastic <- subset(d_superkingdom, is_chloroplastic == TRUE)
# Sort by descending count and keep at most TOP rows, then print the result.
# head() is used instead of [1:TOP] so that fewer than TOP matching rows
# do not produce padded all-NA rows in the table.
(top_chloroplastic <- top_chloroplastic[head(order(top_chloroplastic$count, decreasing = TRUE), TOP), ])
##     Superkingdom     PFAM_Name
## 828    Eukaryota         PPR_2
## 354    Eukaryota           PPR
## 48     Eukaryota         RRM_1
## 869    Eukaryota         PPR_3
## 819    Eukaryota         PPR_1
## 171    Eukaryota Ribosomal_L12
## 108    Eukaryota  Ribosomal_L6
## 244    Eukaryota Homoserine_dh
## 287    Eukaryota          NifU
## 480    Eukaryota         Clp_N
##                                                     PFAM_desc   model
## 828                                         PPR repeat family PF13041
## 354                                                PPR repeat PF01535
## 48    RNA recognition motif. (a.k.a. RRM, RBD, or RNP domain) PF00076
## 869                           Pentatricopeptide repeat domain PF13812
## 819                                                PPR repeat PF12854
## 171                Ribosomal protein L7/L12 C-terminal domain PF00542
## 108                                      Ribosomal protein L6 PF00347
## 244                                  Homoserine dehydrogenase PF00742
## 287                                          NifU-like domain PF01106
## 480 Clp amino terminal domain, pathogenicity island component PF02861
##     is_chloroplastic is_mitochondrial count
## 828             TRUE            FALSE    54
## 354             TRUE            FALSE    51
## 48              TRUE            FALSE    14
## 869             TRUE            FALSE    14
## 819             TRUE            FALSE    11
## 171             TRUE            FALSE     5
## 108             TRUE            FALSE     4
## 244             TRUE            FALSE     4
## 287             TRUE            FALSE     4
## 480             TRUE            FALSE     4
  1. PPR repeat family (PPR_2)
    This repeat has no known function. It is about 35 amino acids long and is found in up to 18 copies in some proteins. The family appears to be greatly expanded in plants and fungi. The repeat has been called PPR [1].
    This entry represents the PPR repeat. Pentatricopeptide repeat (PPR) proteins are characterised by tandem repeats of a degenerate 35 amino acid motif [PUBMED:10664580]. Most of PPR proteins have roles in mitochondria or plastid [PUBMED:15270678]. PPR repeats were discovered while screening Arabidopsis proteins for those predicted to be targeted to mitochondria or chloroplast [PUBMED:10664580, PUBMED:15269332]. Some of these proteins have been shown to play a role in post-transcriptional processes within organelles and they are thought to be sequence-specific RNA-binding proteins [PUBMED:12782738, PUBMED:12832482, PUBMED:18031283]. Plant genomes have between one hundred to five hundred PPR genes per genome whereas non-plant genomes encode two to six PPR proteins. Although no PPR structures are yet known, the motif is predicted to fold into a helix-turn-helix structure similar to those found in the tetratricopeptide repeat (TPR) family (see PROSITEDOC) [PUBMED:10664580]. The plant PPR protein family has been divided in two subfamilies on the basis of their motif content and organisation [PUBMED:15269332, PUBMED:17560114]. Examples of PPR repeat-containing proteins include PET309 SWISSPROT, which may be involved in RNA stabilisation [PUBMED:7664742], and crp1, which is involved in RNA processing [PUBMED:8039510]. The repeat is associated with a predicted plant protein SWISSPROT that has a domain organisation similar to the human BRCA1 protein.
  2. PPR repeat (PPR)
    This entry represents the PPR repeat. Pentatricopeptide repeat (PPR) proteins are characterised by tandem repeats of a degenerate 35 amino acid motif [PUBMED:10664580]. Most of PPR proteins have roles in mitochondria or plastid [PUBMED:15270678]. PPR repeats were discovered while screening Arabidopsis proteins for those predicted to be targeted to mitochondria or chloroplast [PUBMED:10664580, PUBMED:15269332]. Some of these proteins have been shown to play a role in post-transcriptional processes within organelles and they are thought to be sequence-specific RNA-binding proteins [PUBMED:12782738, PUBMED:12832482, PUBMED:18031283]. Plant genomes have between one hundred to five hundred PPR genes per genome whereas non-plant genomes encode two to six PPR proteins. Although no PPR structures are yet known, the motif is predicted to fold into a helix-turn-helix structure similar to those found in the tetratricopeptide repeat (TPR) family (see PROSITEDOC) [PUBMED:10664580]. The plant PPR protein family has been divided in two subfamilies on the basis of their motif content and organisation [PUBMED:15269332, PUBMED:17560114]. Examples of PPR repeat-containing proteins include PET309 SWISSPROT, which may be involved in RNA stabilisation [PUBMED:7664742], and crp1, which is involved in RNA processing [PUBMED:8039510]. The repeat is associated with a predicted plant protein SWISSPROT that has a domain organisation similar to the human BRCA1 protein.
  3. RNA recognition motif. (a.k.a. RRM, RBD, or RNP domain)
    RNA recognition motif, RNP-1 is a putative RNA-binding domain of about 90 amino acids that are known to bind single-stranded RNAs. It was found in many eukaryotic proteins.[1][2][3] The largest group of single strand RNA-binding protein is the eukaryotic RNA recognition motif (RRM) family that contains an eight amino acid RNP-1 consensus sequence.[4][5] RRM proteins have a variety of RNA binding preferences and functions, and include heterogeneous nuclear ribonucleoproteins (hnRNPs), proteins implicated in regulation of alternative splicing (SR, U2AF2, Sxl), protein components of small nuclear ribonucleoproteins (U1 and U2 snRNPs), and proteins that regulate RNA stability and translation (PABP, La, Hu).[2][3][5] The RRM in heterodimeric splicing factor U2 snRNP auxiliary factor appears to have two RRM-like domains with specialised features for protein recognition.[6] The motif also appears in a few single stranded DNA binding proteins. The typical RRM consists of four anti-parallel beta-strands and two alpha-helices arranged in a beta-alpha-beta-beta-alpha-beta fold with side chains that stack with RNA bases. A third helix is present during RNA binding in some cases.[7] The RRM is reviewed in a number of publications.[8][9][10]
    The RRM motif is probably diagnostic of an RNA binding protein. RRMs are found in a variety of RNA binding proteins, including various hnRNP proteins, proteins implicated in regulation of alternative splicing, and protein components of snRNPs. The motif also appears in a few single stranded DNA binding proteins. The RRM structure consists of four strands and two helices arranged in an alpha/beta sandwich, with a third helix present during RNA binding in some cases. The C-terminal beta strand (4th strand) and final helix are hard to align and have been omitted in the SEED alignment. The LA proteins (P05455) have an N-terminal RRM which is included in the seed. There is a second region towards the C terminus that has some features characteristic of an RRM but does not appear to have the important structural core of an RRM. The LA proteins (P05455) are one of the main autoantigens in Systemic lupus erythematosus (SLE), an autoimmune disease.
  4. Pentatricopeptide repeat domain (PPR_3)
    This entry represents the PPR repeat. Pentatricopeptide repeat (PPR) proteins are characterised by tandem repeats of a degenerate 35 amino acid motif [PUBMED:10664580]. Most of PPR proteins have roles in mitochondria or plastid [PUBMED:15270678]. PPR repeats were discovered while screening Arabidopsis proteins for those predicted to be targeted to mitochondria or chloroplast [PUBMED:10664580, PUBMED:15269332]. Some of these proteins have been shown to play a role in post-transcriptional processes within organelles and they are thought to be sequence-specific RNA-binding proteins [PUBMED:12782738, PUBMED:12832482, PUBMED:18031283]. Plant genomes have between one hundred to five hundred PPR genes per genome whereas non-plant genomes encode two to six PPR proteins. Although no PPR structures are yet known, the motif is predicted to fold into a helix-turn-helix structure similar to those found in the tetratricopeptide repeat (TPR) family (see PROSITEDOC) [PUBMED:10664580]. The plant PPR protein family has been divided in two subfamilies on the basis of their motif content and organisation [PUBMED:15269332, PUBMED:17560114]. Examples of PPR repeat-containing proteins include PET309 SWISSPROT, which may be involved in RNA stabilisation [PUBMED:7664742], and crp1, which is involved in RNA processing [PUBMED:8039510]. The repeat is associated with a predicted plant protein SWISSPROT that has a domain organisation similar to the human BRCA1 protein.
    This family matches additional variants of the PPR repeat that were not captured by the model for PF01535 (PPR). In the case of the Arabidopsis protein UniProtKB:Q66GI4, the repeated helices in this N-terminal region of protein-only RNase P (PRORP) enzymes form the pentatricopeptide repeat (PPR) domain, which enhances pre-tRNA binding affinity. PRORP enzymes process precursor tRNAs in human mitochondria and in all tRNA-using compartments of Arabidopsis thaliana [1].
  5. PPR repeat (PPR_1)
    This entry represents the PPR repeat. Pentatricopeptide repeat (PPR) proteins are characterised by tandem repeats of a degenerate 35 amino acid motif [PUBMED:10664580]. Most of PPR proteins have roles in mitochondria or plastid [PUBMED:15270678]. PPR repeats were discovered while screening Arabidopsis proteins for those predicted to be targeted to mitochondria or chloroplast [PUBMED:10664580, PUBMED:15269332]. Some of these proteins have been shown to play a role in post-transcriptional processes within organelles and they are thought to be sequence-specific RNA-binding proteins [PUBMED:12782738, PUBMED:12832482, PUBMED:18031283]. Plant genomes have between one hundred to five hundred PPR genes per genome whereas non-plant genomes encode two to six PPR proteins. Although no PPR structures are yet known, the motif is predicted to fold into a helix-turn-helix structure similar to those found in the tetratricopeptide repeat (TPR) family (see PROSITEDOC) [PUBMED:10664580]. The plant PPR protein family has been divided in two subfamilies on the basis of their motif content and organisation [PUBMED:15269332, PUBMED:17560114]. Examples of PPR repeat-containing proteins include PET309 SWISSPROT, which may be involved in RNA stabilisation [PUBMED:7664742], and crp1, which is involved in RNA processing [PUBMED:8039510]. The repeat is associated with a predicted plant protein SWISSPROT that has a domain organisation similar to the human BRCA1 protein.
  6. Ribosomal protein L7/L12 C-terminal domain
    Ribosomes are the particles that catalyse mRNA-directed protein synthesis in all organisms. The codons of the mRNA are exposed on the ribosome to allow tRNA binding. This leads to the incorporation of amino acids into the growing polypeptide chain in accordance with the genetic information. Incoming amino acid monomers enter the ribosomal A site in the form of aminoacyl-tRNAs complexed with elongation factor Tu (EF-Tu) and GTP. The growing polypeptide chain, situated in the P site as peptidyl-tRNA, is then transferred to aminoacyl-tRNA and the new peptidyl-tRNA, extended by one residue, is translocated to the P site with the aid the elongation factor G (EF-G) and GTP as the deacylated tRNA is released from the ribosome through one or more exit sites [PUBMED:11297922, PUBMED:11290319]. About 2/3 of the mass of the ribosome consists of RNA and 1/3 of protein. The proteins are named in accordance with the subunit of the ribosome which they belong to - the small (S1 to S31) and the large (L1 to L44). Usually they decorate the rRNA cores of the subunits. Many ribosomal proteins, particularly those of the large subunit, are composed of a globular, surfaced-exposed domain with long finger-like projections that extend into the rRNA core to stabilise its structure. Most of the proteins interact with multiple RNA elements, often from different domains. In the large subunit, about 1/3 of the 23S rRNA nucleotides are at least in van der Waal’s contact with protein, and L22 interacts with all six domains of the 23S rRNA. Proteins S4 and S7, which initiate assembly of the 16S rRNA, are located at junctions of five and four RNA helices, respectively. In this way proteins serve to organise and stabilise the rRNA tertiary structure. While the crucial activities of decoding and peptide transfer are RNA based, proteins play an active role in functions that may have evolved to streamline the process of protein synthesis. 
In addition to their function in the ribosome, many ribosomal proteins have some function ‘outside’ the ribosome [PUBMED:11290319, PUBMED:11114498]. This entry represents the C-terminal domain of the large subunit ribosomal proteins, known as the L7/L12 family. L7/L12 is present in each 50S subunit in four copies organised as two dimers. The L8 protein complex consisting of two dimers of L7/L12 and L10 in Escherichia coli ribosomes is assembled on the conserved region of 23 S rRNA termed the GTPase-associated domain [PUBMED:10488095]. The L7/L12 dimer probably interacts with EF-Tu. L7 and L12 only differ in a single post translational modification of the addition of an acetyl group to the N terminus of L7.
  7. Ribosomal protein L6
    Ribosomes are the particles that catalyse mRNA-directed protein synthesis in all organisms. The codons of the mRNA are exposed on the ribosome to allow tRNA binding. This leads to the incorporation of amino acids into the growing polypeptide chain in accordance with the genetic information. Incoming amino acid monomers enter the ribosomal A site in the form of aminoacyl-tRNAs complexed with elongation factor Tu (EF-Tu) and GTP. The growing polypeptide chain, situated in the P site as peptidyl-tRNA, is then transferred to aminoacyl-tRNA and the new peptidyl-tRNA, extended by one residue, is translocated to the P site with the aid the elongation factor G (EF-G) and GTP as the deacylated tRNA is released from the ribosome through one or more exit sites [PUBMED:11297922, PUBMED:11290319]. About 2/3 of the mass of the ribosome consists of RNA and 1/3 of protein. The proteins are named in accordance with the subunit of the ribosome which they belong to - the small (S1 to S31) and the large (L1 to L44). Usually they decorate the rRNA cores of the subunits. Many ribosomal proteins, particularly those of the large subunit, are composed of a globular, surfaced-exposed domain with long finger-like projections that extend into the rRNA core to stabilise its structure. Most of the proteins interact with multiple RNA elements, often from different domains. In the large subunit, about 1/3 of the 23S rRNA nucleotides are at least in van der Waal’s contact with protein, and L22 interacts with all six domains of the 23S rRNA. Proteins S4 and S7, which initiate assembly of the 16S rRNA, are located at junctions of five and four RNA helices, respectively. In this way proteins serve to organise and stabilise the rRNA tertiary structure. While the crucial activities of decoding and peptide transfer are RNA based, proteins play an active role in functions that may have evolved to streamline the process of protein synthesis. 
In addition to their function in the ribosome, many ribosomal proteins have some function ‘outside’ the ribosome [PUBMED:11290319, PUBMED:11114498].
    L6 is a protein from the large (50S) subunit. In Escherichia coli, it is located in the aminoacyl-tRNA binding site of the peptidyltransferase centre, and is known to bind directly to 23S rRNA. It belongs to a family of ribosomal proteins, including L6 from bacteria, cyanelles (structures that perform similar functions to chloroplasts, but have structural and biochemical characteristics of Cyanobacteria) and mitochondria; and L9 from mammals, Drosophila, plants and yeast. L6 contains two domains with almost identical folds, suggesting that it was derived by the duplication of an ancient RNA-binding protein gene. Analysis reveals several sites on the protein surface where interactions with other ribosome components may occur, the N terminus being involved in protein-protein interactions and the C terminus containing possible RNA-binding sites [PUBMED:8262035]. This entry represents the alpha-beta domain found duplicated in ribosomal L6 proteins. This domain consists of two beta-sheets and one alpha-helix packed around a single core [PUBMED:8262035].
  8. Homoserine dehydrogenase
    Bacteria, plants and fungi metabolise aspartic acid to produce four amino acids - lysine, threonine, methionine and isoleucine - in a series of reactions known as the aspartate pathway. Additionally, several important metabolic intermediates are produced by these reactions, such as diaminopimelic acid, an essential component of bacterial cell wall biosynthesis, and dipicolinic acid, which is involved in sporulation in Gram-positive bacteria. Members of the animal kingdom do not possess this pathway and must therefore acquire these essential amino acids through their diet. Research into improving the metabolic flux through this pathway has the potential to increase the yield of the essential amino acids in important crops, thus improving their nutritional value. Additionally, since the enzymes are not present in animals, inhibitors of them are promising targets for the development of novel antibiotics and herbicides. For more information see [PUBMED:11352712]. Homoserine dehydrogenase (EC 1.1.1.3) catalyses the third step in the aspartate pathway: the NAD(P)-dependent reduction of aspartate beta-semialdehyde into homoserine [PUBMED:8500624, PUBMED:8395899]. Homoserine is an intermediate in the biosynthesis of threonine, isoleucine, and methionine. The enzyme can be found in a monofunctional form, in some bacteria and yeast, or a bifunctional form consisting of an N-terminal aspartokinase domain and a C-terminal homoserine dehydrogenase domain, as found in bacteria such as Escherichia coli and in plants. Structural analysis of the yeast monofunctional enzyme (SWISSPROT) indicates that the enzyme is a dimer composed of three distinct regions: an N-terminal nucleotide-binding domain, a short central dimerisation region, and a C-terminal catalytic domain [PUBMED:10700284]. The N-terminal domain forms a modified Rossmann fold, while the catalytic domain forms a novel alpha-beta mixed sheet. This entry represents the catalytic domain of homoserine dehydrogenase.
  9. NifU-like domain
    Iron-sulphur (FeS) clusters are important cofactors for numerous proteins involved in electron transfer, in redox and non-redox catalysis, in gene regulation, and as sensors of oxygen and iron. These functions depend on the various FeS cluster prosthetic groups, the most common being [2Fe-2S] and [4Fe-4S] [PUBMED:16221578]. FeS cluster assembly is a complex process involving the mobilisation of Fe and S atoms from storage sources, their assembly into [Fe-S] form, their transport to specific cellular locations, and their transfer to recipient apoproteins. So far, three FeS assembly machineries have been identified, which are capable of synthesising all types of [Fe-S] clusters: ISC (iron-sulphur cluster), SUF (sulphur assimilation), and NIF (nitrogen fixation) systems. The ISC system is conserved in eubacteria and eukaryotes (mitochondria), and has broad specificity, targeting general FeS proteins [PUBMED:16211402, PUBMED:16843540]. It is encoded by the isc operon (iscRSUA-hscBA-fdx-iscX). IscS is a cysteine desulphurase, which obtains S from cysteine (converting it to alanine) and serves as a S donor for FeS cluster assembly. IscU and IscA act as scaffolds to accept S and Fe atoms, assembling clusters and transfering them to recipient apoproteins. HscA is a molecular chaperone and HscB is a co-chaperone. Fdx is a [2Fe-2S]-type ferredoxin. IscR is a transcription factor that regulates expression of the isc operon. IscX (also known as YfhJ) appears to interact with IscS and may function as an Fe donor during cluster assembly [PUBMED:15937904]. The SUF system is an alternative pathway to the ISC system that operates under iron starvation and oxidative stress. It is found in eubacteria, archaea and eukaryotes (plastids). The SUF system is encoded by the suf operon (sufABCDSE), and the six encoded proteins are arranged into two complexes (SufSE and SufBCD) and one protein (SufA). SufS is a pyridoxal-phosphate (PLP) protein displaying cysteine desulphurase activity. 
SufE acts as a scaffold protein that accepts S from SufS and donates it to SufA [PUBMED:17350000]. SufC is an ATPase with an unorthodox ATP-binding cassette (ABC)-like component. No specific functions have been assigned to SufB and SufD. SufA is homologous to IscA [PUBMED:15278785], acting as a scaffold protein in which Fe and S atoms are assembled into [FeS] cluster forms, which can then easily be transferred to apoproteins targets. In the NIF system, NifS and NifU are required for the formation of metalloclusters of nitrogenase in Azotobacter vinelandii, and other organisms, as well as in the maturation of other FeS proteins. Nitrogenase catalyses the fixation of nitrogen. It contains a complex cluster, the FeMo cofactor, which contains molybdenum, Fe and S. NifS is a cysteine desulphurase. NifU binds one Fe atom at its N-terminal, assembling an FeS cluster that is transferred to nitrogenase apoproteins [PUBMED:11498000]. Nif proteins involved in the formation of FeS clusters can also be found in organisms that do not fix nitrogen [PUBMED:8875867]. This entry represents the C-terminal of NifU and homologous proteins. NifU contains two domains: an N-terminal (INTERPRO) and a C-terminal domain [PUBMED:8048161]. These domains exist either together or on different polypeptides, both domains being found in organisms that do not fix nitrogen (e.g. yeast), so they have a broader significance in the cell than nitrogen fixation.
  10. Clp amino terminal domain, pathogenicity island component
    ClpA is an ATP-dependent chaperone and part of the ClpAP protease that participates in regulatory protein degradation and the dissolution and degradation of protein aggregates [PUBMED:2186030]. ClpA recognises sequences in specific proteins, which it then unfolds in an ATP-dependent manner and transports into the degradation chamber of the associated ClpP protein [PUBMED:10485712, PUBMED:11287666]. A small adaptor-like protein, ClpS, modulates the activity of ClpA and is an important regulatory factor for this protein [PUBMED:12235156]. It protects ClpA from autodegradation and appears to redirect its activity away from soluble proteins and toward aggregated proteins. This entry represents the double Clp-N motif domain found at the N terminus of ATP-dependent Clp proteases. This N-terminal domain interacts with the D1 domain found in Clp proteases in a fashion similar to that seen in adaptor-binding domains of other AAA(+) proteins [PUBMED:12205096].

SUMMARY:
- a distinct function found only in bacteria, plants and fungi: homoserine dehydrogenase acts in the aspartate pathway, which animals lack
- Fe-S cluster formation (NifU-like domain); Fe-S clusters serve as cofactors in electron transfer, in redox and non-redox catalysis, in gene regulation, and as sensors of oxygen and iron

Top-10 mitochondrial PFAM-Families

# Top mitochondrial PFAM families: keep the rows of d_superkingdom flagged
# is_mitochondrial, then retain the (at most) TOP rows with the highest count.
top_mitochondrial <- subset(d_superkingdom, is_mitochondrial == TRUE)
# head() instead of [1:TOP]: when fewer than TOP rows match, 1:TOP indexes past
# the end of the ordering and pads the table with all-NA rows.
# The outer parentheses print the table in addition to assigning it.
(top_mitochondrial <- top_mitochondrial[
  head(order(top_mitochondrial$count, decreasing = TRUE), TOP),
])
##     Superkingdom PFAM_Name                                       PFAM_desc
## 827    Eukaryota     PPR_2                               PPR repeat family
## 353    Eukaryota       PPR                                      PPR repeat
## 867    Eukaryota     PPR_3                 Pentatricopeptide repeat domain
## 818    Eukaryota     PPR_1                                      PPR repeat
## 127    Eukaryota      WD40                        WD domain, G-beta repeat
## 856    Eukaryota EF-hand_7                             EF-hand domain pair
## 158    Eukaryota     TPR_1                        Tetratricopeptide repeat
## 77     Eukaryota Mito_carr                   Mitochondrial carrier protein
## 152    Eukaryota   HMG_box                   HMG (high mobility group) box
## 71     Eukaryota   Hexapep Bacterial transferase hexapeptide (six repeats)
##       model is_chloroplastic is_mitochondrial count
## 827 PF13041            FALSE             TRUE   137
## 353 PF01535            FALSE             TRUE   131
## 867 PF13812            FALSE             TRUE    58
## 818 PF12854            FALSE             TRUE    53
## 127 PF00400            FALSE             TRUE    25
## 856 PF13499            FALSE             TRUE    11
## 158 PF00515            FALSE             TRUE     9
## 77  PF00153            FALSE             TRUE     8
## 152 PF00505            FALSE             TRUE     7
## 71  PF00132            FALSE             TRUE     6
  1. PPR repeat family (PPR_2)
    This repeat has no known function. It is about 35 amino acids long and is found in up to 18 copies in some proteins. The family appears to be greatly expanded in plants and fungi. The repeat has been called PPR [1].
    This entry represents the PPR repeat. Pentatricopeptide repeat (PPR) proteins are characterised by tandem repeats of a degenerate 35 amino acid motif [PUBMED:10664580]. Most of PPR proteins have roles in mitochondria or plastid [PUBMED:15270678]. PPR repeats were discovered while screening Arabidopsis proteins for those predicted to be targeted to mitochondria or chloroplast [PUBMED:10664580, PUBMED:15269332]. Some of these proteins have been shown to play a role in post-transcriptional processes within organelles and they are thought to be sequence-specific RNA-binding proteins [PUBMED:12782738, PUBMED:12832482, PUBMED:18031283]. Plant genomes have between one hundred to five hundred PPR genes per genome whereas non-plant genomes encode two to six PPR proteins. Although no PPR structures are yet known, the motif is predicted to fold into a helix-turn-helix structure similar to those found in the tetratricopeptide repeat (TPR) family (see PROSITEDOC) [PUBMED:10664580]. The plant PPR protein family has been divided in two subfamilies on the basis of their motif content and organisation [PUBMED:15269332, PUBMED:17560114]. Examples of PPR repeat-containing proteins include PET309 SWISSPROT, which may be involved in RNA stabilisation [PUBMED:7664742], and crp1, which is involved in RNA processing [PUBMED:8039510]. The repeat is associated with a predicted plant protein SWISSPROT that has a domain organisation similar to the human BRCA1 protein.
  2. PPR repeat (PPR)
    This entry represents the PPR repeat. Pentatricopeptide repeat (PPR) proteins are characterised by tandem repeats of a degenerate 35 amino acid motif [PUBMED:10664580]. Most of PPR proteins have roles in mitochondria or plastid [PUBMED:15270678]. PPR repeats were discovered while screening Arabidopsis proteins for those predicted to be targeted to mitochondria or chloroplast [PUBMED:10664580, PUBMED:15269332]. Some of these proteins have been shown to play a role in post-transcriptional processes within organelles and they are thought to be sequence-specific RNA-binding proteins [PUBMED:12782738, PUBMED:12832482, PUBMED:18031283]. Plant genomes have between one hundred to five hundred PPR genes per genome whereas non-plant genomes encode two to six PPR proteins. Although no PPR structures are yet known, the motif is predicted to fold into a helix-turn-helix structure similar to those found in the tetratricopeptide repeat (TPR) family (see PROSITEDOC) [PUBMED:10664580]. The plant PPR protein family has been divided in two subfamilies on the basis of their motif content and organisation [PUBMED:15269332, PUBMED:17560114]. Examples of PPR repeat-containing proteins include PET309 SWISSPROT, which may be involved in RNA stabilisation [PUBMED:7664742], and crp1, which is involved in RNA processing [PUBMED:8039510]. The repeat is associated with a predicted plant protein SWISSPROT that has a domain organisation similar to the human BRCA1 protein.
  3. Pentatricopeptide repeat domain (PPR_3)
    This entry represents the PPR repeat. Pentatricopeptide repeat (PPR) proteins are characterised by tandem repeats of a degenerate 35 amino acid motif [PUBMED:10664580]. Most of PPR proteins have roles in mitochondria or plastid [PUBMED:15270678]. PPR repeats were discovered while screening Arabidopsis proteins for those predicted to be targeted to mitochondria or chloroplast [PUBMED:10664580, PUBMED:15269332]. Some of these proteins have been shown to play a role in post-transcriptional processes within organelles and they are thought to be sequence-specific RNA-binding proteins [PUBMED:12782738, PUBMED:12832482, PUBMED:18031283]. Plant genomes have between one hundred to five hundred PPR genes per genome whereas non-plant genomes encode two to six PPR proteins. Although no PPR structures are yet known, the motif is predicted to fold into a helix-turn-helix structure similar to those found in the tetratricopeptide repeat (TPR) family (see PROSITEDOC) [PUBMED:10664580]. The plant PPR protein family has been divided in two subfamilies on the basis of their motif content and organisation [PUBMED:15269332, PUBMED:17560114]. Examples of PPR repeat-containing proteins include PET309 SWISSPROT, which may be involved in RNA stabilisation [PUBMED:7664742], and crp1, which is involved in RNA processing [PUBMED:8039510]. The repeat is associated with a predicted plant protein SWISSPROT that has a domain organisation similar to the human BRCA1 protein.
  4. PPR repeat (PPR_1)
    This entry represents the PPR repeat. Pentatricopeptide repeat (PPR) proteins are characterised by tandem repeats of a degenerate 35 amino acid motif [PUBMED:10664580]. Most of PPR proteins have roles in mitochondria or plastid [PUBMED:15270678]. PPR repeats were discovered while screening Arabidopsis proteins for those predicted to be targeted to mitochondria or chloroplast [PUBMED:10664580, PUBMED:15269332]. Some of these proteins have been shown to play a role in post-transcriptional processes within organelles and they are thought to be sequence-specific RNA-binding proteins [PUBMED:12782738, PUBMED:12832482, PUBMED:18031283]. Plant genomes have between one hundred to five hundred PPR genes per genome whereas non-plant genomes encode two to six PPR proteins. Although no PPR structures are yet known, the motif is predicted to fold into a helix-turn-helix structure similar to those found in the tetratricopeptide repeat (TPR) family (see PROSITEDOC) [PUBMED:10664580]. The plant PPR protein family has been divided in two subfamilies on the basis of their motif content and organisation [PUBMED:15269332, PUBMED:17560114]. Examples of PPR repeat-containing proteins include PET309 SWISSPROT, which may be involved in RNA stabilisation [PUBMED:7664742], and crp1, which is involved in RNA processing [PUBMED:8039510]. The repeat is associated with a predicted plant protein SWISSPROT that has a domain organisation similar to the human BRCA1 protein.
  5. WD domain, G-beta repeat
    WD-40 repeats (also known as WD or beta-transducin repeats) are short ~40 amino acid motifs, often terminating in a Trp-Asp (W-D) dipeptide. WD40 repeats usually assume a 7-8 bladed beta-propeller fold, but proteins have been found with 4 to 16 repeated units, which also form a circularised beta-propeller structure. WD-repeat proteins are a large family found in all eukaryotes and are implicated in a variety of functions ranging from signal transduction and transcription regulation to cell cycle control and apoptosis. Repeated WD40 motifs act as a site for protein-protein interaction, and proteins containing WD40 repeats are known to serve as platforms for the assembly of protein complexes or mediators of transient interplay among other proteins. The specificity of the proteins is determined by the sequences outside the repeats themselves. Examples of such complexes are G proteins (beta subunit is a beta-propeller), TAFII transcription factor, and E3 ubiquitin ligase [PUBMED:11814058, PUBMED:10322433]. In Arabidopsis spp., several WD40-containing proteins act as key regulators of plant-specific developmental events. One group of WD40 repeats is involved in RNA-processing complexes, some members forming part of snRNP particles [6]. Another group makes up parts of transcriptional regulators, including the TFIID subunit of the TATA-box-binding complex [7, 8, 9]. Others play roles in cytoskeleton assembly and mitotic-spindle formation [10, 11], regulate vesicle formation and vesicular trafficking [12], control various aspects of cell division [13] or regulate sulfur metabolism in fungi [14]. Examples of such complexes are G proteins (beta subunit is a beta-propeller), TAFII transcription factor, and E3 ubiquitin ligase.[3][4]
  6. EF-hand domain pair
    The EF hand is a helix-loop-helix structural domain or motif found in a large family of calcium-binding proteins. The EF-hand motif contains a helix-loop-helix topology, much like the spread thumb and forefinger of the human hand, in which the Ca2+ ions are coordinated by ligands within the loop. The motif takes its name from traditional nomenclature used in describing the protein parvalbumin, which contains three such motifs and is probably involved in muscle relaxation via its calcium-binding activity. The EF-hand consists of two alpha helices linked by a short loop region (usually about 12 amino acids) that usually binds calcium ions. EF-hands also appear in each structural domain of the signaling protein calmodulin and in the muscle protein troponin-C. Example: aequorin is a calcium binding protein (CaBP) isolated from the coelenterate Aequorea victoria. Aequorin belongs to the EF-hand family of CaBPs, with EF-hand loops that are closely related to CaBPs in mammals. In addition, aequorin has been used for years as an indicator of Ca2+ and has been shown to be safe and well tolerated by cells. Aequorin is made up of two components – the calcium binding component apoaequorin (AQ) and the chemiluminescent molecule coelenterazine. The AQ portion of this protein contains the EF-hand calcium binding domains.[2]
  7. Tetratricopeptide repeat
    The tetratricopeptide repeat (TPR) is a structural motif present in a wide range of proteins [PUBMED:7667876, PUBMED:9482716, PUBMED:1882418]. It mediates protein-protein interactions and the assembly of multiprotein complexes [PUBMED:14659697]. The TPR motif consists of 3-16 tandem repeats of 34 amino acid residues, although individual TPR motifs can be dispersed in the protein sequence. Sequence alignment of the TPR domains reveals a consensus sequence defined by a pattern of small and large amino acids. TPR motifs have been identified in various different organisms, ranging from bacteria to humans. Proteins containing TPRs are involved in a variety of biological processes, such as cell cycle regulation, transcriptional control, mitochondrial and peroxisomal protein transport, neurogenesis and protein folding. The X-ray structure of a domain containing three TPRs from protein phosphatase 5 revealed that TPR adopts a helix-turn-helix arrangement, with adjacent TPR motifs packing in a parallel fashion, resulting in a spiral of repeating anti-parallel alpha-helices [PUBMED:14659697]. The two helices are denoted helix A and helix B. The packing angle between helix A and helix B is ~24 degrees within a single TPR, which generates a right-handed superhelical shape. Helix A interacts with helix B and with helix A’ of the next TPR. Two protein surfaces are generated: the inner concave surface is contributed mainly by residues on helices A, and the other surface presents residues from both helices A and B.
  8. Mitochondrial carrier protein
    Mitochondrial carriers are proteins from a solute carrier family which transfer molecules across the membranes of the mitochondria.[1] Mitochondrial carriers are also classified in the Transporter Classification Database. The Mitochondrial Carrier (MC) Superfamily has been expanded to include both the original Mitochondrial Carrier (MC) family (TC# 2.A.29) and the Mitochondrial Inner/Outer Membrane Fusion (MMF) family (TC# 9.B.25).[2]
    A variety of substrate carrier proteins that are involved in energy transfer are found in the inner mitochondrial membrane or integral to the membrane of other eukaryotic organelles such as the peroxisome [PUBMED:2158156, PUBMED:8140286, PUBMED:8487299, PUBMED:8206158, PUBMED:8291088]. Such proteins include: ADP, ATP carrier protein (ADP/ATP translocase); 2-oxoglutarate/malate carrier protein; phosphate carrier protein; tricarboxylate transport protein (or citrate transport protein); Graves disease carrier protein; yeast mitochondrial proteins MRS3 and MRS4; yeast mitochondrial FAD carrier protein; and many others. Structurally, these proteins can consist of up to three tandem repeats of a domain of approximately 100 residues, each domain containing two transmembrane regions.
  9. HMG (high mobility group) box
    In molecular biology, the HMG-box (high mobility group box) is a protein domain which is involved in DNA binding.[1] HMG-box containing proteins only bind non-B-type DNA conformations (kinked or unwound) with high affinity.[1] HMG-box domains are found in high mobility group proteins, which are involved in the regulation of DNA-dependent processes such as transcription, replication, and DNA repair, all of which require changing the conformation of chromatin.[2] The single and the double box HMG proteins alter DNA architecture by inducing bends upon binding.[3][4]
    High mobility group (HMG) box domains are involved in binding DNA, and may be involved in protein-protein interactions as well. The structure of the HMG-box domain consists of three helices in an irregular array. HMG-box domains are found in one or more copies in HMG-box proteins, which form a large, diverse family involved in the regulation of DNA-dependent processes such as transcription, replication, and strand repair, all of which require the bending and unwinding of chromatin. Many of these proteins are regulators of gene expression. HMG-box proteins are found in a variety of eukaryotic organisms, and can be broadly divided into two groups, based on sequence-dependent and sequence-independent DNA recognition; the former usually contain one HMG-box motif, while the latter can contain multiple HMG-box motifs. HMG-box domains can be found in single or multiple copies in the following protein classes: HMG1 and HMG2 non-histone components of chromatin; SRY (sex determining region Y protein) involved in differential gonadogenesis; the SOX family of transcription factors [PUBMED:12920151]; sequence-specific LEF1 (lymphoid enhancer binding factor 1) and TCF-1 (T-cell factor 1) involved in regulation of organogenesis and thymocyte differentiation [PUBMED:10890911]; structure-specific recognition protein SSRP involved in transcription and replication; MTF1 mitochondrial transcription factor; nucleolar transcription factors UBF 1/2 (upstream binding factor) involved in transcription by RNA polymerase I; Abf2 yeast ARS-binding factor [PUBMED:11779632]; yeast transcription factors lxr1, Rox1, Nhp6b and Spp41; mating type proteins (MAT) involved in the sexual reproduction of fungi [PUBMED:12781674]; and the YABBY plant-specific transcription factors.
  10. Bacterial transferase hexapeptide (six repeats) A variety of bacterial transferases contain a repeat structure composed of tandem repeats of a [LIV]-G-X(4) hexapeptide, which, in the tertiary structure of LpxA (UDP N-acetylglucosamine acyltransferase) [PUBMED:7481807], has been shown to form a left-handed parallel beta helix. A number of different transferase protein families contain this repeat, such as
    galactoside acetyltransferase-like proteins [PUBMED:11937062]: The galactoside acetyltransferase (thiogalactoside transacetylase) of Escherichia coli (GAT, LacA, EC 2.3.1.18) is a gene product of the classical lac operon. GAT may assist cellular detoxification by acetylating nonmetabolizable pyranosides, thereby preventing their reentry into the cell.
    the gamma-class of carbonic anhydrases [PUBMED:10924115]: Carbonic anhydrases (CA: EC:4.2.1.1) are zinc metalloenzymes which catalyse the reversible hydration of carbon dioxide to bicarbonate [PMID: 18336305, PMID: 10978542].
    and tetrahydrodipicolinate-N-succinyltransferases (DapD), the latter containing an extra N-terminal 3-helical domain [PUBMED:11910040]: Tetrahydrodipicolinate N-succinyltransferase (DapD) catalyzes the succinyl-CoA-dependent acylation of L-2-amino-6-oxopimelate to 2-N-succinyl-6-oxopimelate as part of the succinylase branch of the meso-diaminopimelate/lysine biosynthetic pathway of bacteria, blue-green algae, and plants. This pathway provides meso-diaminopimelate as a building block for cell wall peptidoglycan in most bacteria, and is regarded as a target pathway for antibacterial agents.

SUMMARY:
- DNA-binding & gene regulating (HMG box: changing conformation of chromatin)

Archaea vs. Bacteria

# # select only archaea and bacteria entries
# df_spec <- df %>%
#   group_by(Superkingdom) %>%
#   filter(Superkingdom == "Archaea" | Superkingdom == "Bacteria" & rel_freq >= 0.02)
# df_spec
# 
# ## make plot
# pd <- position_dodge(0.4)
# p <- ggplot(df_spec, aes(x = Superkingdom, y = rel_freq, color = PFAM_desc))+
#   geom_point(position = pd)+
#   geom_line(aes(group = PFAM_desc), 
#             position = pd)+
#   scale_x_discrete(position = "top")+
#   scale_color_manual(values = c('#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928'))+
#   labs(x = "Superkingdom",
#        y = "Relative Frequency",
#        color = "PFAM Family",
#        title = "Prokaryota")+
#   theme(legend.position="right",
#         plot.margin = margin(l = 0, r=0))+
#   guides(color=guide_legend(nrow=15, byrow=TRUE))
# p <- beautifier(p)
# p
# if( save) {
#   ggsave(paste0(pathImages, "PFAM_archaea_bacteria", figureFormat), width=12, height=8, dpi = 300)
# }
# Count TR entries per Superkingdom and PFAM family. The chloroplast and
# mitochondrion flags are left out of the grouping to reduce duplicates.
d_superkingdom <- plyr::ddply(
  tr_all_sp_sub,
  plyr::.(Superkingdom, PFAM_Name, PFAM_desc, model),
  plyr::summarise,
  count = length(ID)
)

# Per superkingdom: order by descending count and keep at most TOP rows.
# head() is used instead of indexing with 1:TOP because the latter pads the
# result with all-NA rows whenever a superkingdom has fewer than TOP families,
# and those NA rows break the Superkingdom lookup inside extend_top_hits().
top_bacteria <- subset(d_superkingdom, Superkingdom == "Bacteria")
top_bacteria <- head(
  top_bacteria[order(top_bacteria$count, decreasing = TRUE), ],
  TOP
)

top_archaea <- subset(d_superkingdom, Superkingdom == "Archaea")
top_archaea <- head(
  top_archaea[order(top_archaea$count, decreasing = TRUE), ],
  TOP
)

# Extend the bacterial top hits with the bacterial counts of families that are
# only in the archaeal top hits, and vice versa.
x <- extend_top_hits(top_bacteria, top_archaea, d_superkingdom)

# Stack the extended bacterial (x[[1]]) and archaeal (x[[2]]) tables.
df_spec <- rbind(x[[1]], x[[2]])

# Merge rows that share Superkingdom, PFAM_Name, PFAM_desc and model by summing
# their remaining columns (here: count).
# The dplyr verbs are namespace-qualified because plyr is attached as well and
# exports its own, group-unaware mutate()/summarise().
df_spec <- df_spec %>%
  dplyr::group_by(Superkingdom, PFAM_Name, PFAM_desc, model) %>%
  dplyr::summarise_all(sum)

# Rank the counts within each superkingdom. base::rank() ranks ascending, so
# the family with the largest count receives the highest rank.
df_spec <- df_spec %>%
  dplyr::group_by(Superkingdom) %>%
  dplyr::mutate(rank = base::rank(count))

# Aggregate archaea and bacteria entries and calculate relative frequency
# df_spec <- rbind(pfam_archaea, pfam_bacteria) %>%
#   group_by(Superkingdom, PFAM_desc) %>%
#   summarise(n=sum(count)) %>%
#   mutate(rel_freq = n/sum(n))

# # Select only those above a certain threshold (Visual Cosmetics)
# df_spec <- df_spec %>%
#   group_by(Superkingdom) %>%
#   filter(Superkingdom == "Archaea" | Superkingdom == "Bacteria" & rel_freq >= 0.01)
# df_spec

## make plot
# Slope-style plot: one point per PFAM family and superkingdom, with a line
# joining the same family across the two superkingdoms.
pd <- position_dodge(0.4)

# group = model (the PFAM accession) rather than PFAM_desc: different families
# can share one description (e.g. two Archaea entries are both described as
# "4Fe-4S dicluster domain"), and grouping by description would join them with
# a line inside a single superkingdom. Setting the group once in the top-level
# aes() keeps points and lines dodged identically.
p <- ggplot(
  df_spec,
  aes(x = Superkingdom, y = rank, color = PFAM_desc, group = model)
) +
  geom_point(position = pd) +
  geom_line(position = pd) +
  scale_x_discrete(position = "top") +
  scale_color_manual(values = morecolors2) +
  labs(
    x = "Superkingdom",
    y = "Rank of count",
    color = "PFAM Family",
    title = "Prokaryota"
  ) +
  theme(
    legend.position = "right",
    plot.margin = margin(l = 0, r = 0)
  ) +
  guides(color = guide_legend(nrow = 20, byrow = TRUE))
p <- beautifier(p)
p

# Write the static figure when saving is enabled. The plot is passed
# explicitly so the file does not depend on which plot was displayed last.
if (save) {
  ggsave(
    paste0(pathImages, "PFAM_archaea_bacteria", figureFormat),
    plot = p, width = 12, height = 8, dpi = 300
  )
}

# Interactive version of the same data; replaces the ggplot object in p.
p <- make_it_plotly(df_spec)
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:XVector':
## 
##     slice
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# When saving is enabled, write the interactive plot to an HTML file next to
# the static figures; afterwards display the widget.
if (save) {
  htmlwidgets::saveWidget(
    widget = p,
    file = paste0(pathImages, "PFAM_archaea_bacteria", ".html")
  )
}
p

The two connected Archaea points belong to two different PFAM families (different PFAM_Name and accession) that share the same PFAM_desc:
Archaea Fer4_10 4Fe-4S dicluster domain PF13237 7 4.0
Archaea Fer4_7 4Fe-4S dicluster domain PF12838 13 9.0

Chloroplasts vs Prokaryota

# Summarize by Superkingdom, without Chloroplastic and mitochondrial information to reduce dublicates 
# d_superkingdom = ddply(tr_all_sp_sub, .(Superkingdom, PFAM_Name, PFAM_desc, model), 
#           summarize, 
#           count=length(ID))
# Count TR entries per Superkingdom and PFAM family, keeping the chloroplast
# flag in the grouping so that chloroplastic entries can be selected below.
d_superkingdom <- plyr::ddply(
  tr_all_sp_sub,
  plyr::.(Superkingdom, PFAM_Name, PFAM_desc, model, is_chloroplastic),
  plyr::summarise,
  count = length(ID)
)

# Per subset: order by descending count and keep at most TOP rows. head() is
# used instead of indexing with 1:TOP, which pads the result with all-NA rows
# whenever a subset has fewer than TOP entries.
top_bacteria <- subset(d_superkingdom, Superkingdom == "Bacteria")
top_bacteria <- head(
  top_bacteria[order(top_bacteria$count, decreasing = TRUE), ],
  TOP
)

top_archaea <- subset(d_superkingdom, Superkingdom == "Archaea")
top_archaea <- head(
  top_archaea[order(top_archaea$count, decreasing = TRUE), ],
  TOP
)

top_eukaryota <- subset(d_superkingdom, Superkingdom == "Eukaryota")
top_eukaryota <- head(
  top_eukaryota[order(top_eukaryota$count, decreasing = TRUE), ],
  TOP
)

# Combine archaeal and bacterial top hits and relabel them as Prokaryota.
# as.character() keeps the relabelling valid if Superkingdom is a factor
# (assigning a level that does not exist to a factor would give NA).
top_prokaryota <- rbind(top_archaea, top_bacteria) %>%
  dplyr::mutate(
    Superkingdom = replace(
      as.character(Superkingdom),
      Superkingdom %in% c("Archaea", "Bacteria"),
      "Prokaryota"
    )
  )

# Top hits among entries flagged as chloroplastic; the wrapping parentheses
# print the table.
top_chloroplastic <- subset(d_superkingdom, is_chloroplastic == TRUE)
(top_chloroplastic <- head(
  top_chloroplastic[order(top_chloroplastic$count, decreasing = TRUE), ],
  TOP
))
##     Superkingdom     PFAM_Name
## 816    Eukaryota         PPR_2
## 347    Eukaryota           PPR
## 48     Eukaryota         RRM_1
## 851    Eukaryota         PPR_3
## 808    Eukaryota         PPR_1
## 166    Eukaryota Ribosomal_L12
## 107    Eukaryota  Ribosomal_L6
## 238    Eukaryota Homoserine_dh
## 281    Eukaryota          NifU
## 472    Eukaryota         Clp_N
##                                                     PFAM_desc   model
## 816                                         PPR repeat family PF13041
## 347                                                PPR repeat PF01535
## 48    RNA recognition motif. (a.k.a. RRM, RBD, or RNP domain) PF00076
## 851                           Pentatricopeptide repeat domain PF13812
## 808                                                PPR repeat PF12854
## 166                Ribosomal protein L7/L12 C-terminal domain PF00542
## 107                                      Ribosomal protein L6 PF00347
## 238                                  Homoserine dehydrogenase PF00742
## 281                                          NifU-like domain PF01106
## 472 Clp amino terminal domain, pathogenicity island component PF02861
##     is_chloroplastic count
## 816             TRUE    55
## 347             TRUE    53
## 48              TRUE    14
## 851             TRUE    14
## 808             TRUE    11
## 166             TRUE     5
## 107             TRUE     4
## 238             TRUE     4
## 281             TRUE     4
## 472             TRUE     4
# Lookup table for extend_top_hits(). That function selects rows whose
# Superkingdom equals the label used in each top-hit table, so the lookup has
# to use the same labels:
#   * Archaea and Bacteria rows are relabelled "Prokaryota"; d_superkingdom
#     itself never contains that label, so without this no prokaryotic counts
#     could ever be found for the chloroplastic top hits.
#   * Of the remaining rows only chloroplastic ones are kept; otherwise the
#     counts added to the chloroplast side would include non-chloroplastic
#     eukaryotic entries.
# NOTE(review): families already in top_prokaryota keep the counts of the
# archaeal/bacterial top-hit rows they came from, not the pooled prokaryotic
# count - confirm this is intended.
kingdom_label <- as.character(d_superkingdom$Superkingdom)
is_prokaryote <- kingdom_label %in% c("Archaea", "Bacteria")
d_lookup <- d_superkingdom
d_lookup$Superkingdom <- replace(kingdom_label, is_prokaryote, "Prokaryota")
d_lookup <- d_lookup[is_prokaryote | d_lookup$is_chloroplastic %in% TRUE, ]

# Extend the prokaryotic top hits with the prokaryotic counts of families that
# are only in the chloroplastic top hits, and vice versa.
x <- extend_top_hits(top_prokaryota, top_chloroplastic, d_lookup)

# Stack the extended prokaryotic (x[[1]]) and chloroplastic (x[[2]]) tables.
df_spec <- rbind(x[[1]], x[[2]])

# Merge rows that share Superkingdom, PFAM_Name, PFAM_desc and model by summing
# their remaining columns. Such duplicates arise here because the counts were
# split by is_chloroplastic and because Archaea and Bacteria were pooled.
# The dplyr verbs are namespace-qualified because plyr is attached as well and
# exports its own, group-unaware mutate()/summarise().
df_spec <- df_spec %>%
  dplyr::group_by(Superkingdom, PFAM_Name, PFAM_desc, model) %>%
  dplyr::summarise_all(sum)

# Rank the counts within each group. base::rank() ranks ascending, so the
# family with the largest count receives the highest rank.
df_spec <- df_spec %>%
  dplyr::group_by(Superkingdom) %>%
  dplyr::mutate(rank = base::rank(count))

## make plot
# Slope-style plot: one point per PFAM family and group, with a line joining
# the same family across the two groups.
pd <- position_dodge(0.4)

# group = model (the PFAM accession) rather than PFAM_desc: different families
# can share one description, and grouping by description would join them with
# a line inside a single group. Setting the group once in the top-level aes()
# keeps points and lines dodged identically.
p <- ggplot(
  df_spec,
  aes(x = Superkingdom, y = rank, color = PFAM_desc, group = model)
) +
  geom_point(position = pd) +
  geom_line(position = pd) +
  scale_x_discrete(position = "top") +
  scale_color_manual(values = morecolors2) +
  labs(
    x = "Superkingdom",
    y = "Rank of count",
    color = "PFAM Family",
    title = "Eukaryotic Proteins with Chloroplastic origin vs Prokaryotic proteins"
  ) +
  theme(
    legend.position = "right",
    plot.margin = margin(l = 0, r = 0)
  ) +
  guides(color = guide_legend(nrow = 20, byrow = TRUE))
p <- beautifier(p)
p

# Write the static figure when saving is enabled. The plot is passed
# explicitly so the file does not depend on which plot was displayed last.
if (save) {
  ggsave(
    paste0(pathImages, "PFAM_chloroplasts_vs_Prokaryota", figureFormat),
    plot = p, width = 12, height = 8, dpi = 300
  )
}

# Interactive version of the same data; replaces the ggplot object in p.
p <- make_it_plotly(df_spec)
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:XVector':
## 
##     slice
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# When saving is enabled, write the interactive plot to an HTML file next to
# the static figures; afterwards display the widget.
if (save) {
  htmlwidgets::saveWidget(
    widget = p,
    file = paste0(pathImages, "PFAM_chloroplasts_vs_Prokaryota", ".html")
  )
}
p

Bacterial transferase hexapeptide is the protein family found most frequently in both chloroplastic and prokaryotic TRs.
PPR TRs are found only in eukaryotic proteins of chloroplastic origin.
Ribosomal protein L6 can be found in both groups; however, it ranks higher in prokaryotes. The ribosomal L7/L12 C-terminal domain is found only in TRs from eukaryotic chloroplasts.

Prokaryota vs Eukaryota

# d = ddply(tr_all_sp_sub, .(Superkingdom, PFAM_Name, PFAM_desc, model, is_chloroplastic, is_mitochondrial), 
#           summarize, 
#           count=length(ID)) 
# d_spec <- d %>%
#   group_by(Superkingdom) %>%
#   mutate(Superkingdom2 = ifelse(Superkingdom == "Archaea" | Superkingdom == "Bacteria", "Prokaryota", 
#                                 ifelse(Superkingdom == "Eukaryota", "Eukaryota",
#                                        ifelse(Superkingdom == "Viruses", "Viruses", NA))))
# 
# top_prokaryota <- subset(d_spec, Superkingdom2=="Prokaryota")
# top_prokaryota <- top_prokaryota[order(top_prokaryota$count,decreasing=T)[1:TOP],]
# 
# top_virus = subset(d_spec, Superkingdom2=="Viruses")
# top_virus <- top_virus[order(top_virus$count,decreasing=T)[1:TOP],]
# 
# top_eukaryota = subset(d_spec, Superkingdom2=="Eukaryota")
# top_eukaryota <-top_eukaryota[order(top_eukaryota$count,decreasing=T)[1:TOP],]
# 
# ### Prokaryota vs Eukaryota
# # extend Prokaryota with Eukaryota
# pfam_prokaryota <- top_prokaryota
# for (i in 1:nrow(top_eukaryota)){
#   # check for each row in top_eukaryota if not top_eukaryota$PFAM_desc[i] in top_bacteria$PFAM_desc[i]
#   if (!(top_eukaryota$model[i] %in% top_prokaryota$model)){
#     # if not, then append the corresponding PFAM-entry of bacteria
#     pfam_prokaryota <- rbind(pfam_prokaryota, d_spec[which(d_spec$model == top_eukaryota$model[i] & d_spec$Superkingdom2 == "Prokaryota"),])
#   }
# }
# top_eukaryota$model %in% top_prokaryota$model
# pfam_prokaryota
# top_eukaryota
# top_prokaryota
# 
# # extend Eukaryota with Prokaryota
# pfam_eukaryota <- top_eukaryota
# for (i in 1:nrow(top_prokaryota)){
#   # check for each row in top_eukaryota if not top_eukaryota$PFAM_desc[i] in top_prokaryota$PFAM_desc[i]
#   if (!(top_prokaryota$model[i] %in% top_eukaryota$model)){
#     # if not, then append the corresponding PFAM-entry of bacteria
#     pfam_eukaryota <- rbind(pfam_eukaryota, d_spec[which(d_spec$model == top_prokaryota$model[i] & d_spec$Superkingdom == "Eukaryota"),])
#   }
# }
# # select only archaea and bacteria entries
# df_spec <- df %>%
#   group_by(Superkingdom) 
# df_spec
# 
# # recalculate percentage
# df_spec <- df %>%
#   group_by(Superkingdom2, PFAM_desc) %>%
#   distinct() %>%
#   summarise(n2=sum(n)) %>%
#   mutate(rel_freq2 = n2/sum(n2))%>%
#   filter((Superkingdom2 == "Prokaryota" | Superkingdom2 == "Eukaryota") & rel_freq2 >= 0.01)
# df_spec
# 
# ## make plot
# pd <- position_dodge(0.4)
# p <- ggplot(df_spec, aes(x = Superkingdom2, y = rel_freq2, color = PFAM_desc))+
#   geom_point(position = pd)+
#   geom_line(aes(group = PFAM_desc), 
#             position = pd)+
#   scale_x_discrete(position = "top")+
#   # scale_color_manual(values = c('#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928'))+
#   labs(x = "Superkingdom",
#        y = "Relative Frequency",
#        color = "PFAM Family")+
#   theme(legend.position="right",
#         plot.margin = margin(l = 0, r=0))+
#   guides(color=guide_legend(nrow=14, byrow=TRUE))
# p <- beautifier(p)
# p
# if( save) {
#   ggsave(paste0(pathImages, "PFAM_prokaryota_eukaryota", figureFormat), width=12, height=8, dpi = 300)
# }
# Count TR entries per Superkingdom and PFAM family. The chloroplast and
# mitochondrion flags are left out of the grouping to reduce duplicates.
d_superkingdom <- plyr::ddply(
  tr_all_sp_sub,
  plyr::.(Superkingdom, PFAM_Name, PFAM_desc, model),
  plyr::summarise,
  count = length(ID)
)

# Per superkingdom: order by descending count and keep at most TOP rows.
# head() is used instead of indexing with 1:TOP, which pads the result with
# all-NA rows whenever a superkingdom has fewer than TOP families.
top_bacteria <- subset(d_superkingdom, Superkingdom == "Bacteria")
top_bacteria <- head(
  top_bacteria[order(top_bacteria$count, decreasing = TRUE), ],
  TOP
)

top_archaea <- subset(d_superkingdom, Superkingdom == "Archaea")
top_archaea <- head(
  top_archaea[order(top_archaea$count, decreasing = TRUE), ],
  TOP
)

top_eukaryota <- subset(d_superkingdom, Superkingdom == "Eukaryota")
top_eukaryota <- head(
  top_eukaryota[order(top_eukaryota$count, decreasing = TRUE), ],
  TOP
)

# Combine archaeal and bacterial top hits and relabel them as Prokaryota.
# as.character() keeps the relabelling valid if Superkingdom is a factor
# (assigning a level that does not exist to a factor would give NA).
top_prokaryota <- rbind(top_archaea, top_bacteria) %>%
  dplyr::mutate(
    Superkingdom = replace(
      as.character(Superkingdom),
      Superkingdom %in% c("Archaea", "Bacteria"),
      "Prokaryota"
    )
  )

# Lookup table for extend_top_hits(). That function selects rows whose
# Superkingdom equals the label used in each top-hit table; d_superkingdom
# never contains "Prokaryota", so Archaea and Bacteria rows are relabelled
# here. Without this, no prokaryotic counts could be found for families that
# are only in the eukaryotic top hits.
# NOTE(review): families already in top_prokaryota keep the counts of the
# archaeal/bacterial top-hit rows they came from, not the pooled prokaryotic
# count - confirm this is intended.
kingdom_label <- as.character(d_superkingdom$Superkingdom)
d_lookup <- d_superkingdom
d_lookup$Superkingdom <- replace(
  kingdom_label,
  kingdom_label %in% c("Archaea", "Bacteria"),
  "Prokaryota"
)

# Extend the prokaryotic top hits with the prokaryotic counts of families that
# are only in the eukaryotic top hits, and vice versa.
x <- extend_top_hits(top_prokaryota, top_eukaryota, d_lookup)

# Stack the extended prokaryotic (x[[1]]) and eukaryotic (x[[2]]) tables.
df_spec <- rbind(x[[1]], x[[2]])

# Merge rows that share Superkingdom, PFAM_Name, PFAM_desc and model by summing
# their remaining columns (here: count). Such duplicates arise because Archaea
# and Bacteria were pooled under one label.
# The dplyr verbs are namespace-qualified because plyr is attached as well and
# exports its own, group-unaware mutate()/summarise().
df_spec <- df_spec %>%
  dplyr::group_by(Superkingdom, PFAM_Name, PFAM_desc, model) %>%
  dplyr::summarise_all(sum)

# Rank the counts within each group. base::rank() ranks ascending, so the
# family with the largest count receives the highest rank.
df_spec <- df_spec %>%
  dplyr::group_by(Superkingdom) %>%
  dplyr::mutate(rank = base::rank(count))

## make plot
# Slope-style plot: one point per PFAM family and group, with a line joining
# the same family across the two groups. No manual colour scale is set here,
# so ggplot2's default discrete palette is used.
pd <- position_dodge(0.4)

# group = model (the PFAM accession) rather than PFAM_desc: different families
# can share one description, and grouping by description would join them with
# a line inside a single group. Setting the group once in the top-level aes()
# keeps points and lines dodged identically.
p <- ggplot(
  df_spec,
  aes(x = Superkingdom, y = rank, color = PFAM_desc, group = model)
) +
  geom_point(position = pd) +
  geom_line(position = pd) +
  scale_x_discrete(position = "top") +
  labs(
    x = "Superkingdom",
    y = "Rank of count",
    color = "PFAM Family"
  ) +
  theme(
    legend.position = "right",
    plot.margin = margin(l = 0, r = 0)
  ) +
  guides(color = guide_legend(nrow = 20, byrow = TRUE))
p <- beautifier(p)
p

# Write the static figure when saving is enabled. The plot is passed
# explicitly so the file does not depend on which plot was displayed last.
if (save) {
  ggsave(
    paste0(pathImages, "PFAM_prokaryota_eukaryota", figureFormat),
    plot = p, width = 12, height = 8, dpi = 300
  )
}

# Interactive version of the same data; replaces the ggplot object in p.
p <- make_it_plotly(df_spec)
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:XVector':
## 
##     slice
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# When saving is enabled, write the interactive plot to an HTML file next to
# the static figures; afterwards display the widget.
if (save) {
  htmlwidgets::saveWidget(
    widget = p,
    file = paste0(pathImages, "PFAM_prokaryota_eukaryota", ".html")
  )
}
p

Viruses vs eukaryota

# Count TR entries per Superkingdom and PFAM family. The chloroplast and
# mitochondrion flags are left out of the grouping to reduce duplicates.
d_superkingdom <- plyr::ddply(
  tr_all_sp_sub,
  plyr::.(Superkingdom, PFAM_Name, PFAM_desc, model),
  plyr::summarise,
  count = length(ID)
)

# Per superkingdom: order by descending count and keep at most TOP rows.
# head() is used instead of indexing with 1:TOP because the latter pads the
# result with all-NA rows whenever a superkingdom has fewer than TOP families,
# and those NA rows break the Superkingdom lookup inside extend_top_hits().
top_virus <- subset(d_superkingdom, Superkingdom == "Viruses")
top_virus <- head(
  top_virus[order(top_virus$count, decreasing = TRUE), ],
  TOP
)

top_eukaryota <- subset(d_superkingdom, Superkingdom == "Eukaryota")
top_eukaryota <- head(
  top_eukaryota[order(top_eukaryota$count, decreasing = TRUE), ],
  TOP
)

# Extend the viral top hits with the viral counts of families that are only in
# the eukaryotic top hits, and vice versa.
x <- extend_top_hits(top_virus, top_eukaryota, d_superkingdom)

# Stack the extended viral (x[[1]]) and eukaryotic (x[[2]]) tables.
df_spec <- rbind(x[[1]], x[[2]])

# Merge rows that share Superkingdom, PFAM_Name, PFAM_desc and model by summing
# their remaining columns (here: count).
# The dplyr verbs are namespace-qualified because plyr is attached as well and
# exports its own, group-unaware mutate()/summarise().
df_spec <- df_spec %>%
  dplyr::group_by(Superkingdom, PFAM_Name, PFAM_desc, model) %>%
  dplyr::summarise_all(sum)

# Rank the counts within each superkingdom. base::rank() ranks ascending, so
# the family with the largest count receives the highest rank.
df_spec <- df_spec %>%
  dplyr::group_by(Superkingdom) %>%
  dplyr::mutate(rank = base::rank(count))

## make plot
# Slope-style plot: one point per PFAM family and superkingdom, with a line
# joining the same family across the two superkingdoms.
pd <- position_dodge(0.4)

# group = model (the PFAM accession) rather than PFAM_desc: different families
# can share one description, and grouping by description would join them with
# a line inside a single superkingdom. Setting the group once in the top-level
# aes() keeps points and lines dodged identically.
p <- ggplot(
  df_spec,
  aes(x = Superkingdom, y = rank, color = PFAM_desc, group = model)
) +
  geom_point(position = pd) +
  geom_line(position = pd) +
  scale_x_discrete(position = "top") +
  # Fixed palette of 22 colours.
  # NOTE(review): ggplot2 stops with an error if there are more PFAM_desc
  # values than colours - confirm 22 is enough for the chosen TOP.
  scale_color_manual(values = c(
    "#e6194b", "#3cb44b", "#ffe119", "#4363d8", "#f58231", "#911eb4",
    "#46f0f0", "#f032e6", "#bcf60c", "#fabebe", "#008080", "#e6beff",
    "#9a6324", "#fffac8", "#800000", "#aaffc3", "#808000", "#ffd8b1",
    "#000075", "#808080", "#ffffff", "#000000"
  )) +
  labs(
    x = "Superkingdom",
    y = "Rank of count",
    color = "PFAM Family"
  ) +
  theme(
    legend.position = "right",
    plot.margin = margin(l = 0, r = 0)
  ) +
  guides(color = guide_legend(nrow = 20, byrow = TRUE))
p <- beautifier(p)
p

# Write the static figure when saving is enabled. The plot is passed
# explicitly so the file does not depend on which plot was displayed last.
if (save) {
  ggsave(
    paste0(pathImages, "PFAM_viruses_eukaryota", figureFormat),
    plot = p, width = 12, height = 8, dpi = 300
  )
}

# Interactive version of the same data; replaces the ggplot object in p.
p <- make_it_plotly(df_spec)
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:XVector':
## 
##     slice
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# When saving is enabled, write the interactive plot to an HTML file next to
# the static figures; afterwards display the widget.
if (save) {
  htmlwidgets::saveWidget(
    widget = p,
    file = paste0(pathImages, "PFAM_viruses_eukaryota", ".html")
  )
}
p

Proteins with TRs from Eukaryota and Viruses belong to very distinct PFAM families. Among the top hits, they share only the zinc knuckle and WD40 domain repeats. In contrast to Archaea vs Viruses, Eukaryota and Viruses do not share the transcription factor TFIIB repeat.
Except for WD40, none of the top hits from Eukaryota are found in Viruses.

Viruses vs Archaea

# Count TR entries per Superkingdom and PFAM family. The chloroplast and
# mitochondrion flags are left out of the grouping to reduce duplicates.
d_superkingdom <- plyr::ddply(
  tr_all_sp_sub,
  plyr::.(Superkingdom, PFAM_Name, PFAM_desc, model),
  plyr::summarise,
  count = length(ID)
)

# Per superkingdom: order by descending count and keep at most TOP rows.
# head() is used instead of indexing with 1:TOP because the latter pads the
# result with all-NA rows whenever a superkingdom has fewer than TOP families,
# and those NA rows break the Superkingdom lookup inside extend_top_hits().
top_virus <- subset(d_superkingdom, Superkingdom == "Viruses")
top_virus <- head(
  top_virus[order(top_virus$count, decreasing = TRUE), ],
  TOP
)

top_archaea <- subset(d_superkingdom, Superkingdom == "Archaea")
top_archaea <- head(
  top_archaea[order(top_archaea$count, decreasing = TRUE), ],
  TOP
)

# Extend the viral top hits with the viral counts of families that are only in
# the archaeal top hits, and vice versa.
x <- extend_top_hits(top_virus, top_archaea, d_superkingdom)

# Stack the extended viral (x[[1]]) and archaeal (x[[2]]) tables.
df_spec <- rbind(x[[1]], x[[2]])

# Merge rows that share Superkingdom, PFAM_Name, PFAM_desc and model by summing
# their remaining columns (here: count).
# The dplyr verbs are namespace-qualified because plyr is attached as well and
# exports its own, group-unaware mutate()/summarise().
df_spec <- df_spec %>%
  dplyr::group_by(Superkingdom, PFAM_Name, PFAM_desc, model) %>%
  dplyr::summarise_all(sum)

# Rank the counts within each superkingdom. base::rank() ranks ascending, so
# the family with the largest count receives the highest rank.
df_spec <- df_spec %>%
  dplyr::group_by(Superkingdom) %>%
  dplyr::mutate(rank = base::rank(count))

## make plot
# One point per PFAM family and superkingdom, placed at its within-superkingdom
# rank, with a line joining the same family across superkingdoms.
pd <- position_dodge(0.4)
# Grouping is by Pfam accession (model), not by description: distinct families
# can share one PFAM_desc (e.g. Fer4_7 and Fer4_10 are both
# "4Fe-4S dicluster domain"), and grouping by description would chain them into
# a single line. The group is set in the top-level aes so that points and lines
# are dodged identically. Colour is still mapped to the description.
p <- ggplot(df_spec,
            aes(x = Superkingdom, y = rank, color = PFAM_desc, group = model)) +
  geom_point(position = pd) +
  geom_line(position = pd) +
  scale_x_discrete(position = "top") +
  # Fixed palette of 22 colours.
  # NOTE(review): the palette contains '#ffffff', which may be invisible on a
  # white panel -- confirm against the theme applied by beautifier().
  scale_color_manual(values = c(
    '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4',
    '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff',
    '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1',
    '#000075', '#808080', '#ffffff', '#000000'
  )) +
  labs(x = "Superkingdom",
       y = "Rank of count",
       color = "PFAM Family") +
  theme(legend.position = "right",
        plot.margin = margin(l = 0, r = 0)) +
  guides(color = guide_legend(nrow = 20, byrow = TRUE))
p <- beautifier(p)
p

# Write the static figure to disk when saving is enabled. The plot object is
# passed explicitly so the saved file does not depend on ggplot2's
# "last plot" global state.
if (save) {
  ggsave(paste0(pathImages, "PFAM_viruses_archaea", figureFormat),
         plot = p, width = 12, height = 8, dpi = 300)
}

# Build the interactive (plotly) version of the same comparison from df_spec.
p <- make_it_plotly(df_spec)
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:XVector':
## 
##     slice
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Export the interactive widget as an HTML file when saving is enabled,
# then display it.
if (save) {
  htmlwidgets::saveWidget(
    p,
    paste0(pathImages, "PFAM_viruses_archaea", ".html")
  )
}
p

Proteins with TRs from Archaea and Viruses belong to very distinct PFAM families. Among the top hits, they share only the Transcription factor TFIIB repeat.

Bacteria vs Eukaryota

# Summarize by Superkingdom, without chloroplastic and mitochondrial
# information, to reduce duplicates: one row per (Superkingdom, PFAM family)
# with the number of matching rows of tr_all_sp_sub as `count`.
d_superkingdom <- ddply(tr_all_sp_sub, .(Superkingdom, PFAM_Name, PFAM_desc, model),
                        summarize,
                        count = length(ID))

# TOP most frequent bacterial families. head() caps the selection at the number
# of available rows; indexing with 1:TOP would pad the table with all-NA rows
# if there were fewer than TOP families. The outer parentheses print the table
# in addition to assigning it.
top_bacteria <- subset(d_superkingdom, Superkingdom == "Bacteria")
(top_bacteria <- top_bacteria[utils::head(order(top_bacteria$count, decreasing = TRUE), TOP), ])
##     Superkingdom       PFAM_Name
## 157     Bacteria         Hexapep
## 187     Bacteria            MraZ
## 237     Bacteria    Ribosomal_L6
## 196     Bacteria    NTP_transf_3
## 158     Bacteria       Hexapep_2
## 206     Bacteria            PD40
## 55      Bacteria Acetyltransf_11
## 177     Bacteria            LpxD
## 264     Bacteria          TolB_N
## 106     Bacteria   DNA_gyraseA_C
##                                                            PFAM_desc
## 157                  Bacterial transferase hexapeptide (six repeats)
## 187                            MraZ protein, putative antitoxin-like
## 237                                             Ribosomal protein L6
## 196                                 MobA-like NTP transferase domain
## 158                       Hexapeptide repeat of succinyl-transferase
## 206                                  WD40-like Beta Propeller Repeat
## 55               Udp N-acetylglucosamine O-acyltransferase; Domain 2
## 177 UDP-3-O-[3-hydroxymyristoyl] glucosamine N-acyltransferase, LpxD
## 264                                       TolB amino-terminal domain
## 106                     DNA gyrase C-terminal domain, beta-propeller
##       model count
## 157 PF00132   928
## 187 PF02381   320
## 237 PF00347   317
## 196 PF12804   244
## 158 PF14602   223
## 206 PF07676   164
## 55  PF13720   158
## 177 PF04613   127
## 264 PF04052   115
## 106 PF03989   100
# TOP most frequent eukaryotic families. head() caps the selection at the
# number of available rows; indexing with 1:TOP would pad the table with all-NA
# rows if there were fewer than TOP families.
top_eukaryota <- subset(d_superkingdom, Superkingdom == "Eukaryota")
top_eukaryota <- top_eukaryota[utils::head(order(top_eukaryota$count, decreasing = TRUE), TOP), ]

# Extend the bacterial top list with the families that are only in the
# eukaryotic top list (using the bacterial counts for them), and vice versa.
x <- extend_top_hits(top_bacteria, top_eukaryota, d_superkingdom)

# Stack the extended Bacteria table on top of the extended Eukaryota table.
df_spec <- rbind(x[[1]], x[[2]])

# Collapse rows that share the same Superkingdom, PFAM_Name, PFAM_desc and
# model into a single row by summing their counts, so each family appears at
# most once per superkingdom.
df_spec <- df_spec %>%
  group_by(Superkingdom, PFAM_Name, PFAM_desc, model) %>%
  summarise_all(sum)

# Rank the families by count within each superkingdom (rank 1 = lowest count;
# tied counts share their average rank).
df_spec <- df_spec %>%
  group_by(Superkingdom) %>%
  mutate(rank = base::rank(count))

## make plot
# One point per PFAM family and superkingdom, placed at its within-superkingdom
# rank, with a line joining the same family across superkingdoms.
pd <- position_dodge(0.4)
# Grouping is by Pfam accession (model), not by description: distinct families
# can share one PFAM_desc (e.g. TPR_1 and TPR_2 are both
# "Tetratricopeptide repeat"), and grouping by description would chain them
# into a single line. The group is set in the top-level aes so that points and
# lines are dodged identically. Colour is still mapped to the description.
p <- ggplot(df_spec,
            aes(x = Superkingdom, y = rank, color = PFAM_desc, group = model)) +
  geom_point(position = pd) +
  geom_line(position = pd) +
  scale_x_discrete(position = "top") +
  # Fixed palette of 22 colours.
  # NOTE(review): the palette contains '#ffffff', which may be invisible on a
  # white panel -- confirm against the theme applied by beautifier().
  scale_color_manual(values = c(
    '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4',
    '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff',
    '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1',
    '#000075', '#808080', '#ffffff', '#000000'
  )) +
  labs(x = "Superkingdom",
       y = "Rank of count",
       color = "PFAM Family") +
  theme(legend.position = "right",
        plot.margin = margin(l = 0, r = 0)) +
  guides(color = guide_legend(nrow = 20, byrow = TRUE))
p <- beautifier(p)
p

# Write the static figure to disk when saving is enabled. The plot object is
# passed explicitly so the saved file does not depend on ggplot2's
# "last plot" global state.
if (save) {
  ggsave(paste0(pathImages, "PFAM_bacteria_eukaryota", figureFormat),
         plot = p, width = 12, height = 8, dpi = 300)
}

# Build the interactive (plotly) version of the same comparison from df_spec.
p <- make_it_plotly(df_spec)
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:XVector':
## 
##     slice
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Export the interactive widget as an HTML file when saving is enabled,
# then display it.
if (save) {
  htmlwidgets::saveWidget(
    p,
    paste0(pathImages, "PFAM_bacteria_eukaryota", ".html")
  )
}
p

Bacteria have more TR regions in transferase families (Bacterial transferase hexapeptide and Hexapeptide repeat of succinyl-transferase) than Eukaryota.

Overall

# Distinct PFAM descriptions across the top lists of all four superkingdoms.
# as.character() converts only its first argument and ignores the rest, so each
# column is converted on its own before the vectors are combined.
top_PFAM_desc <- unique(c(as.character(top_eukaryota$PFAM_desc),
                          as.character(top_archaea$PFAM_desc),
                          as.character(top_bacteria$PFAM_desc),
                          as.character(top_virus$PFAM_desc)))

# Stack the four top lists in the order Eukaryota, Archaea, Bacteria, Viruses.
df <- rbind(top_eukaryota, top_archaea, top_bacteria, top_virus)
# df <- rbind(df, top_viridiplantae[,c(1,2,4,8)])
# df <- rbind(df, top_metazoa[,c(1,2,4,8)])
# df <- rbind(df, top_chloroplastic[,c(1,2,4,8)])
# df <- rbind(df, top_mitochondrial[,c(1,2,4,8)])

# # Add percentage
# df <- df %>%
#   group_by(Superkingdom, PFAM_desc) %>%
#   summarise(n=sum(count)) %>%
#   mutate(rel_freq = n/sum(n))

# # Group Archaea and Bacteria to Prokaryota
# df <- df %>%
#   group_by(Superkingdom) %>%
#   mutate(Superkingdom2 = ifelse(Superkingdom == "Archaea" | Superkingdom == "Bacteria", "Prokaryota", 
#                                 ifelse(Superkingdom == "Eukaryota", "Eukaryota",
#                                        ifelse(Superkingdom == "Viruses", "Viruses", NA))))
df
##     Superkingdom       PFAM_Name
## 793    Eukaryota            WD40
## 807    Eukaryota         zf-C2H2
## 550    Eukaryota           LRR_8
## 421    Eukaryota       EF-hand_7
## 696    Eukaryota           RRM_1
## 647    Eukaryota           PPR_2
## 645    Eukaryota             PPR
## 541    Eukaryota             LIM
## 762    Eukaryota           TPR_1
## 763    Eukaryota           TPR_2
## 38       Archaea           TFIIB
## 7        Archaea             CBS
## 12       Archaea            Fer4
## 16       Archaea          Fer4_7
## 26       Archaea     LAGLIDADG_3
## 18       Archaea         Hexapep
## 37       Archaea    TF_Zn_Ribbon
## 13       Archaea         Fer4_10
## 32       Archaea   Rad50_zn_hook
## 34       Archaea    Ribosomal_L6
## 157     Bacteria         Hexapep
## 187     Bacteria            MraZ
## 237     Bacteria    Ribosomal_L6
## 196     Bacteria    NTP_transf_3
## 158     Bacteria       Hexapep_2
## 206     Bacteria            PD40
## 55      Bacteria Acetyltransf_11
## 177     Bacteria            LpxD
## 264     Bacteria          TolB_N
## 106     Bacteria   DNA_gyraseA_C
## 882      Viruses         zf-CCHC
## 853      Viruses         Gag_p17
## 872      Viruses             RVP
## 834      Viruses      Adeno_knob
## 835      Viruses     Adeno_shaft
## 836      Viruses             Ank
## 871      Viruses             rve
## 843      Viruses             BTB
## 870      Viruses         RNase_H
## 876      Viruses           Sushi
##                                                            PFAM_desc
## 793                                         WD domain, G-beta repeat
## 807                                           Zinc finger, C2H2 type
## 550                                              Leucine rich repeat
## 421                                              EF-hand domain pair
## 696          RNA recognition motif. (a.k.a. RRM, RBD, or RNP domain)
## 647                                                PPR repeat family
## 645                                                       PPR repeat
## 541                                                       LIM domain
## 762                                         Tetratricopeptide repeat
## 763                                         Tetratricopeptide repeat
## 38                                 Transcription factor TFIIB repeat
## 7                                                         CBS domain
## 12                                             4Fe-4S binding domain
## 16                                           4Fe-4S dicluster domain
## 26                                             LAGLIDADG-like domain
## 18                   Bacterial transferase hexapeptide (six repeats)
## 37                                                TFIIB zinc-binding
## 13                                           4Fe-4S dicluster domain
## 32                                             Rad50 zinc hook motif
## 34                                              Ribosomal protein L6
## 157                  Bacterial transferase hexapeptide (six repeats)
## 187                            MraZ protein, putative antitoxin-like
## 237                                             Ribosomal protein L6
## 196                                 MobA-like NTP transferase domain
## 158                       Hexapeptide repeat of succinyl-transferase
## 206                                  WD40-like Beta Propeller Repeat
## 55               Udp N-acetylglucosamine O-acyltransferase; Domain 2
## 177 UDP-3-O-[3-hydroxymyristoyl] glucosamine N-acyltransferase, LpxD
## 264                                       TolB amino-terminal domain
## 106                     DNA gyrase C-terminal domain, beta-propeller
## 882                                                     Zinc knuckle
## 853                            gag gene protein p17 (matrix protein)
## 872                                     Retroviral aspartyl protease
## 834                           Adenoviral fibre protein (knob domain)
## 835                   Adenoviral fibre protein (repeat/shaft region)
## 836                                                   Ankyrin repeat
## 871                                            Integrase core domain
## 843                                                   BTB/POZ domain
## 870                                                          RNase H
## 876                                        Sushi repeat (SCR repeat)
##       model count
## 793 PF00400  1474
## 807 PF00096   828
## 550 PF13855   587
## 421 PF13499   531
## 696 PF00076   427
## 647 PF13041   417
## 645 PF01535   410
## 541 PF00412   260
## 762 PF00515   194
## 763 PF07719   184
## 38  PF00382    35
## 7   PF00571    22
## 12  PF00037    16
## 16  PF12838    13
## 26  PF14528    11
## 18  PF00132     9
## 37  PF08271     9
## 13  PF13237     7
## 32  PF04423     7
## 34  PF00347     7
## 157 PF00132   928
## 187 PF02381   320
## 237 PF00347   317
## 196 PF12804   244
## 158 PF14602   223
## 206 PF07676   164
## 55  PF13720   158
## 177 PF04613   127
## 264 PF04052   115
## 106 PF03989   100
## 882 PF00098    56
## 853 PF00540    37
## 872 PF00077    13
## 834 PF00541    11
## 835 PF00608    11
## 836 PF00023    11
## 871 PF00665    11
## 843 PF00651    10
## 870 PF00075     9
## 876 PF00084     9
# Pairwise extension of the top lists (Eukaryota/Archaea and Bacteria/Viruses).
# x and y each hold the two extended tables stacked on top of each other.
x <- extend_top_hits(top_eukaryota, top_archaea, d_superkingdom)
x <- rbind(x[[1]], x[[2]])
y <- extend_top_hits(top_bacteria, top_virus, d_superkingdom)
y <- rbind(y[[1]], y[[2]])

# For every family that is a top hit in any of the four superkingdoms, take its
# row from each of the four superkingdoms.
# extend_top_hits() is not used for this last step: it compares Superkingdom
# against unique(<table>$Superkingdom), which has length 2 for x and y, so the
# comparison is recycled element-wise and drops matching rows; it can also
# append the same row once per duplicate model, which the sum below would then
# double-count.
top_models <- unique(c(as.character(x$model), as.character(y$model)))
top_models <- top_models[!is.na(top_models)]
kingdoms <- unique(c(as.character(x$Superkingdom), as.character(y$Superkingdom)))
kingdoms <- kingdoms[!is.na(kingdoms)]
df_spec <- d_superkingdom[d_superkingdom$model %in% top_models &
                            d_superkingdom$Superkingdom %in% kingdoms, ]

# Same aggregation step as in the pairwise comparisons: rows sharing
# Superkingdom, PFAM_Name, PFAM_desc and model are collapsed by summing their
# counts, which also yields the same grouped-tibble shape as in those sections.
df_spec <- df_spec %>%
  group_by(Superkingdom, PFAM_Name, PFAM_desc, model) %>%
  summarise_all(sum)

# Rank the families by count within each superkingdom (rank 1 = lowest count;
# tied counts share their average rank).
df_spec <- df_spec %>%
  group_by(Superkingdom) %>%
  mutate(rank = base::rank(count))
# # Select only those data points which appear in more than 1 Superkingdoms
# df_spec <- df_spec %>%
#   group_by(PFAM_desc) %>%
#   filter(n()>1)
# df_spec

## make plot
# One point per PFAM family and superkingdom, placed at its within-superkingdom
# rank, with a line joining the same family across superkingdoms.
pd <- position_dodge(0.4)
# Grouping is by Pfam accession (model), not by description: distinct families
# can share one PFAM_desc (e.g. TPR_1 and TPR_2 are both
# "Tetratricopeptide repeat"), and grouping by description would chain them
# into a single line. The group is set in the top-level aes so that points and
# lines are dodged identically. Colour is still mapped to the description.
p <- ggplot(df_spec,
            aes(x = Superkingdom, y = rank, color = PFAM_desc, group = model)) +
  geom_point(position = pd) +
  geom_line(position = pd) +
  scale_x_discrete(position = "top") +
  # No manual colour scale is set here (the 22-colour palette of the pairwise
  # plots is disabled for this figure):
  # scale_color_manual(values = c('#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ffffff', '#000000'))+
  labs(x = "Superkingdom",
       y = "Rank of count",
       color = "PFAM Family") +
  theme(legend.position = "right",
        plot.margin = margin(l = 0, r = 0)) +
  guides(color = guide_legend(nrow = 20, byrow = TRUE))
p <- beautifier(p)
p

# Write the static figure to disk when saving is enabled. The plot object is
# passed explicitly so the saved file does not depend on ggplot2's
# "last plot" global state.
if (save) {
  ggsave(paste0(pathImages, "PFAM_overall", figureFormat),
         plot = p, width = 12, height = 8, dpi = 300)
}
# Inspect all per-superkingdom counts whose description starts with "Ankyrin".
d_superkingdom[grepl("^Ankyrin", d_superkingdom$PFAM_desc), ]
##     Superkingdom PFAM_Name                  PFAM_desc   model count
## 64      Bacteria       Ank             Ankyrin repeat PF00023     4
## 65      Bacteria     Ank_2 Ankyrin repeats (3 copies) PF12796     1
## 66      Bacteria     Ank_3             Ankyrin repeat PF13606     3
## 301    Eukaryota       Ank             Ankyrin repeat PF00023    49
## 302    Eukaryota     Ank_2 Ankyrin repeats (3 copies) PF12796    84
## 303    Eukaryota     Ank_3             Ankyrin repeat PF13606    21
## 836      Viruses       Ank             Ankyrin repeat PF00023    11
## 837      Viruses     Ank_2 Ankyrin repeats (3 copies) PF12796     4
## 838      Viruses     Ank_3             Ankyrin repeat PF13606     2
# Inspect all per-superkingdom counts whose description starts with "Tetratricopeptide".
d_superkingdom[grepl("^Tetratricopeptide", d_superkingdom$PFAM_desc), ]
##     Superkingdom PFAM_Name
## 40       Archaea     TPR_1
## 41       Archaea     TPR_2
## 42       Archaea     TPR_8
## 266     Bacteria     TPR_1
## 267     Bacteria     TPR_2
## 268     Bacteria     TPR_6
## 269     Bacteria     TPR_7
## 270     Bacteria     TPR_8
## 762    Eukaryota     TPR_1
## 763    Eukaryota     TPR_2
## 764    Eukaryota     TPR_6
## 765    Eukaryota     TPR_7
## 766    Eukaryota     TPR_8
## 774    Eukaryota   TTC5_OB
##                                             PFAM_desc   model count
## 40                           Tetratricopeptide repeat PF00515     3
## 41                           Tetratricopeptide repeat PF07719     2
## 42                           Tetratricopeptide repeat PF13181     4
## 266                          Tetratricopeptide repeat PF00515     9
## 267                          Tetratricopeptide repeat PF07719    43
## 268                          Tetratricopeptide repeat PF13174     2
## 269                          Tetratricopeptide repeat PF13176     5
## 270                          Tetratricopeptide repeat PF13181     9
## 762                          Tetratricopeptide repeat PF00515   194
## 763                          Tetratricopeptide repeat PF07719   184
## 764                          Tetratricopeptide repeat PF13174     9
## 765                          Tetratricopeptide repeat PF13176    15
## 766                          Tetratricopeptide repeat PF13181   104
## 774 Tetratricopeptide repeat protein 5 OB fold domain PF16669     1
# Rows of the plotted table (df_spec) whose description starts with "WD".
df_spec[grepl("^WD", df_spec$PFAM_desc), ]
## # A tibble: 3 x 6
## # Groups:   Superkingdom [3]
##   Superkingdom PFAM_Name PFAM_desc                       model  count  rank
##   <chr>        <fct>     <fct>                           <fct>  <int> <dbl>
## 1 Bacteria     PD40      WD40-like Beta Propeller Repeat PF076…   164    10
## 2 Eukaryota    WD40      WD domain, G-beta repeat        PF004…  1474    19
## 3 Viruses      WD40      WD domain, G-beta repeat        PF004…     1     1
# All per-superkingdom counts whose description starts with "WD", for comparison.
d_superkingdom[grepl("^WD", d_superkingdom$PFAM_desc), ]
##     Superkingdom  PFAM_Name
## 206     Bacteria       PD40
## 278     Bacteria       WD40
## 618    Eukaryota       PD40
## 748    Eukaryota TFIID_NTD2
## 793    Eukaryota       WD40
## 880      Viruses       WD40
##                                                PFAM_desc   model count
## 206                      WD40-like Beta Propeller Repeat PF07676   164
## 278                             WD domain, G-beta repeat PF00400     9
## 618                      WD40-like Beta Propeller Repeat PF07676     6
## 748 WD40 associated region in TFIID subunit, NTD2 domain PF04494     7
## 793                             WD domain, G-beta repeat PF00400  1474
## 880                             WD domain, G-beta repeat PF00400     1

Since the overall plot is a bit messy, let's try an interactive version with plotly:

# install.packages("plotly")
# library() stops with an error if plotly is missing; require() would only
# return FALSE and let the script fail later at the first plotly call.
library(plotly)
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:XVector':
## 
##     slice
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# htmlwidgets is only used through the qualified call htmlwidgets::saveWidget()
# below, so it is not attached here (and therefore needs no detach afterwards).

# Text which is displayed in the hover box of a data point.
df_spec$textlabel <- paste("PFAM-Name: ", df_spec$PFAM_Name, "\n",
                           "PFAM-Desc.: ", df_spec$PFAM_desc, "\n",
                           "PFAM-Acce.: ", df_spec$model)
# # Pure plotly, no jitter
# p <- plot_ly(data = df_spec, x = ~Superkingdom, y = ~rank, color = ~PFAM_desc,
#              type = "scatter", mode = "marker", 
#              colors = morecolors, 
#              text = ~textlabel, hoverinfo = 'text') %>%
#   add_trace(y =~PFAM_desc, mode = "lines") %>%
#    layout(xaxis = list(side ="top"))
# htmlwidgets::saveWidget(widget = p, file = paste0(pathImages, "PFAM_overall", ".html"))

# Using ggplotly, with dodged points.
pd <- position_dodge(0.4)
# Grouping is by Pfam accession (model), not by description: distinct families
# can share one PFAM_desc (e.g. TPR_1 and TPR_2 are both
# "Tetratricopeptide repeat"), and grouping by description would chain them
# into a single line. The group is set in the top-level aes so that points and
# lines are dodged identically.
p <- ggplot(df_spec,
            aes(x = Superkingdom, y = rank, color = PFAM_desc,
                group = model, text = textlabel)) +
  geom_point(position = pd) +
  geom_line(position = pd) +
  scale_x_discrete(position = "top") +
  scale_color_manual(values = morecolors2) +
  labs(x = "Superkingdom",
       y = "Rank of count",
       color = "PFAM Family") +
  theme(legend.position = "right",
        plot.margin = margin(l = 0, r = 0)) +
  guides(color = guide_legend(nrow = 20, byrow = TRUE))
p <- beautifier(p)
# Convert to an interactive widget; only the custom text is shown on hover.
p <- ggplotly(p, tooltip = "text") %>%
  layout(xaxis = list(side = "top",
                      tickangle = 45))

if (save) {
  htmlwidgets::saveWidget(widget = p, file = paste0(pathImages, "PFAM_overall", ".html"))
}
p
# Detach plotly again, as it masks many objects of other attached packages.
detach("package:plotly", unload = TRUE)
## Warning: 'htmlwidgets' namespace cannot be unloaded:
##   namespace 'htmlwidgets' is imported by 'rgl', 'manipulateWidget' so cannot be unloaded

Add PFAM information from other regions of the protein

TODO: necessary?

# pfam_map2 <- read.delim("/home/matteo/polybox/MSc_ACLS/swissrepeat/data/uniprot-filtered-reviewed_yes.tab", sep="\t", header = TRUE, quote = "")
# 
# path_protID2PFAM <- "data/uniprot-filtered-reviewed_yes.tab"
# load_protID2PFAM <- function(path){
#   path = paste(local_base_path, path, sep=local_path_separator)
#   pfam_map = read.csv(path, sep="\t", header=TRUE, quote="", stringsAsFactors = FALSE)
#   pfam_map = plyr::rename(pfam_map, c("Entry"="ID", 
#                                       "Cross.reference..Pfam."="PFAM_ID")) %>%
#     select(ID, PFAM_ID)
#   return(pfam_map)}
# pfam_map <- load_protID2PFAM(path_protID2PFAM) 
# d_sub %>%
#   left_join(pfam_map, by = "ID")
# str(pfam_map)
# head(pfam_map)
# head(d_sub)
# pfam_map[which(pfam_map$ID == "A0A0C5CJR8"),]
# d_sub[which(d_sub$ID == "A0A0C5CJR8"),]